diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 4fb6d388076f241b8ec8a4331df7ef4612a4b722..b709d57fab731f78fb003ca7297a4afa760767fe 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -49,9 +49,9 @@ docs:
 platform_compatible_tests:
   stage: platform_compatible_tests
   script:
-    - bazel build mace/core:core --define openmp=true
-    - bazel build --config arm_linux_gnueabihf --define openmp=true --define opencl=true --define neon=true //mace/libmace:libmace.so
-    - bazel build --config aarch64_linux_gnu --define openmp=true --define opencl=true --define neon=true //mace/libmace:libmace.so
+    - bazel build mace/core:core --define openmp=false
+    - bazel build --config arm_linux_gnueabihf --define openmp=false --define opencl=true --define neon=true //mace/libmace:libmace.so
+    - bazel build --config aarch64_linux_gnu --define openmp=false --define opencl=true --define neon=true //mace/libmace:libmace.so
 
 build_libraries:
   stage: build_libraries
@@ -202,13 +202,13 @@ so_size_check:
   stage: so_size_check
   script:
    - DYNAMIC_LIB_PATH="bazel-bin/mace/libmace/libmace.so"
-    - bazel build -s --config android --config optimization mace/libmace:libmace_dynamic --define neon=true --define openmp=true --define opencl=false --define quantize=false --cpu=armeabi-v7a
+    - bazel build -s --config android --config optimization mace/libmace:libmace_dynamic --define neon=true --define openmp=false --define opencl=false --define quantize=false --cpu=armeabi-v7a
     - CURRENT_LIBMACE_SO_SIZE=`ls -l $DYNAMIC_LIB_PATH --block-size=K -s | cut -f 1 -d "K"`
     - TARGET_MACE_WORK_DIR=`mktemp -d`
     - pushd $TARGET_MACE_WORK_DIR
     - GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@github.com:XiaoMi/mace.git
     - pushd mace
-    - bazel build -s --config android --config optimization mace/libmace:libmace_dynamic --define neon=true --define openmp=true --define opencl=false --define quantize=false --cpu=armeabi-v7a
+    - bazel build -s --config android --config optimization mace/libmace:libmace_dynamic --define neon=true --define openmp=false --define opencl=false --define quantize=false --cpu=armeabi-v7a
     - TARGET_LIBMACE_SO_SIZE=`ls -l $DYNAMIC_LIB_PATH --block-size=K -s | cut -f 1 -d "K"`
     - popd
     - popd
diff --git a/WORKSPACE b/WORKSPACE
index 524126a41b27444477f67688afc3acf140bad417..daa855de4784d5b968dbc877bc3cc0031f3e455e 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -79,19 +79,19 @@ new_http_archive(
 
 http_archive(
     name = "gemmlowp",
-    sha256 = "4e9cd60f7871ae9e06dcea5fec1a98ddf1006b32a85883480273e663f143f303",
-    strip_prefix = "gemmlowp-master-66fb41a7cafd2034a50e0b32791359897d657f7a",
+    sha256 = "afbea037aee2d21b625985238486b4219396f9c2550b0fde3157fab4d2580205",
+    strip_prefix = "gemmlowp-master-1f6d8d442805a400c74e63a4a017390733df2e28",
     urls = [
-        "https://cnbj1.fds.api.xiaomi.com/mace/third-party/gemmlowp/gemmlowp-master-66fb41a7cafd2034a50e0b32791359897d657f7a.zip",
+        "http://cnbj1.fds.api.xiaomi.com/mace/third-party/gemmlowp/gemmlowp-master-1f6d8d442805a400c74e63a4a017390733df2e28.zip",
     ],
 )
 
 http_archive(
     name = "tflite",
-    sha256 = "1bb4571ee5cbde427ecfed076b39edaad96ace897ab86bb2495bdb93c706b203",
-    strip_prefix = "tensorflow-mace-ffc8cc7e8c9d1894753509e88b17e251bc6255e3",
+    sha256 = "8b4c1b2ad2d31da9859e17b0ad551b12e1db7ff2faf7e83218901ab48d9fa91a",
+    strip_prefix = "tensorflow-mace-dfabaf85145e4d5ad39f34a0cea57b44c32dbe43",
     urls = [
-        "http://cnbj1.fds.api.xiaomi.com/mace/third-party/tflite/tensorflow-mace-ffc8cc7e8c9d1894753509e88b17e251bc6255e3_custom.zip",
+        "http://cnbj1.fds.api.xiaomi.com/mace/third-party/tflite/tensorflow-mace-dfabaf85145e4d5ad39f34a0cea57b44c32dbe43.zip",
     ],
 )
diff --git a/mace/benchmark/benchmark_model.cc b/mace/benchmark/benchmark_model.cc
index bfe0bb848a02990eee9b14ab2421c9ea4012d66d..e0dac730639276dbd30bf210b466c57d9940feaf 100644
--- a/mace/benchmark/benchmark_model.cc
+++ b/mace/benchmark/benchmark_model.cc
@@ -252,8 +252,7 @@ int Main(int argc, char **argv) {
   MaceEngineConfig config(device_type);
   mace_status = config.SetCPUThreadPolicy(
       FLAGS_omp_num_threads,
-      static_cast<CPUAffinityPolicy>(FLAGS_cpu_affinity_policy),
-      true);
+      static_cast<CPUAffinityPolicy>(FLAGS_cpu_affinity_policy));
   if (mace_status != MaceStatus::MACE_SUCCESS) {
     LOG(INFO) << "Set openmp or cpu affinity failed.";
   }
diff --git a/mace/core/device.cc b/mace/core/device.cc
index 535b7193633cf6881fea54f129c0485ddc3ed585..43f600753c7bc56423b99b04bc277ac84b64c1ce 100644
--- a/mace/core/device.cc
+++ b/mace/core/device.cc
@@ -21,10 +21,10 @@ namespace mace {
 
 CPUDevice::CPUDevice(const int num_threads,
                      const CPUAffinityPolicy policy,
-                     const bool use_gemmlowp)
+                     utils::ThreadPool *thread_pool)
     : cpu_runtime_(make_unique<CPURuntime>(num_threads,
                                            policy,
-                                           use_gemmlowp)),
+                                           thread_pool)),
       scratch_buffer_(make_unique<ScratchBuffer>(GetCPUAllocator())) {}
 
 CPUDevice::~CPUDevice() = default;
diff --git a/mace/core/device.h b/mace/core/device.h
index e5fda181ee66e127f953c6f46937481269ccfc16..85019d9485005dc75f5ba19e682022ef237da6b5 100644
--- a/mace/core/device.h
+++ b/mace/core/device.h
@@ -46,7 +46,7 @@ class CPUDevice : public Device {
  public:
   CPUDevice(const int num_threads,
             const CPUAffinityPolicy policy,
-            const bool use_gemmlowp);
+            utils::ThreadPool *thread_pool);
   virtual ~CPUDevice();
 
 #ifdef MACE_ENABLE_OPENCL
diff --git a/mace/core/net.cc b/mace/core/net.cc
index fbe1c1b8b9da81929732a77c176195f29dd688b9..a10d96bb560b2a145146bcffa88e2b4e045f0e10 100644
--- a/mace/core/net.cc
+++ b/mace/core/net.cc
@@ -136,7 +136,7 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
       make_unique<CPUDevice>(
           target_device->cpu_runtime()->num_threads(),
           target_device->cpu_runtime()->policy(),
-          target_device->cpu_runtime()->use_gemmlowp())) {
+          &target_device->cpu_runtime()->thread_pool())) {
   MACE_LATENCY_LOGGER(1, "Constructing SerialNet");
   // quantize model flag
   bool is_quantize_model = IsQuantizedModel(*net_def);
@@ -154,7 +154,7 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
   }
   for (auto &tensor : net_def->tensors()) {
     tensor_shape_map[tensor.name()] =
-        std::vector(tensor.dims().begin(), tensor.dims().end());
+        std::vector(tensor.dims().begin(), tensor.dims().end());
   }
 
   bool has_data_format = false;
diff --git a/mace/core/quantize.cc b/mace/core/quantize.cc
new file mode 100644
index 0000000000000000000000000000000000000000..167c6da356cb975eaed53ce87343fdd3185ce854
--- /dev/null
+++ b/mace/core/quantize.cc
@@ -0,0 +1,130 @@
+// Copyright 2019 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#if defined(MACE_ENABLE_NEON) +#include +#endif // MACE_ENABLE_NEON + +#include "mace/core/quantize.h" + +namespace mace { + +#ifdef MACE_ENABLE_NEON + +template<> +void QuantizeUtil::QuantizeWithScaleAndZeropoint( + const float *input, + const index_t size, + float scale, + int32_t zero_point, + uint8_t *output) { + const float32x4_t vround = vdupq_n_f32(0.5); + const float32x4_t + vzero = vaddq_f32(vround, vcvtq_f32_s32(vdupq_n_s32(zero_point))); + const float recip_scale = 1.f / scale; + const float32x4_t vrecip_scale = vdupq_n_f32(recip_scale); + const index_t block_count = size / 16; + + thread_pool_->Compute1D([=](index_t start, index_t end, index_t step) { + for (index_t i = start; i < end; i += step) { + float32x4_t vi0 = vld1q_f32(input + i * 16); + float32x4_t vi1 = vld1q_f32(input + i * 16 + 4); + float32x4_t vi2 = vld1q_f32(input + i * 16 + 8); + float32x4_t vi3 = vld1q_f32(input + i * 16 + 12); + + int32x4_t vo0_s32 = vcvtq_s32_f32(vmlaq_f32(vzero, vi0, vrecip_scale)); + int32x4_t vo1_s32 = vcvtq_s32_f32(vmlaq_f32(vzero, vi1, vrecip_scale)); + int32x4_t vo2_s32 = vcvtq_s32_f32(vmlaq_f32(vzero, vi2, vrecip_scale)); + int32x4_t vo3_s32 = vcvtq_s32_f32(vmlaq_f32(vzero, vi3, vrecip_scale)); + + uint8x8_t vo0_u8 = + vqmovun_s16(vcombine_s16(vqmovn_s32(vo0_s32), vqmovn_s32(vo1_s32))); + uint8x8_t vo1_u8 = + vqmovun_s16(vcombine_s16(vqmovn_s32(vo2_s32), vqmovn_s32(vo3_s32))); + uint8x16_t vo = vcombine_u8(vo0_u8, vo1_u8); + + vst1q_u8(output + i * 16, vo); + } + }, 0, block_count, 1); + + for (index_t i = block_count * 16; i < size; ++i) { + output[i] = + Saturate(roundf(zero_point + recip_scale * input[i])); + } +} + +template<> +void QuantizeUtil::Dequantize(const uint8_t *input, + const index_t size, + const float scale, + const int32_t zero_point, + float *output) { + const index_t block_count = size / 16; + const int32x4_t vzero = vdupq_n_s32(zero_point); + const float32x4_t vscale = vdupq_n_f32(scale); + + thread_pool_->Compute1D([=](index_t start, index_t end, index_t step) { + for (index_t i = start; i < end; i += step) { + uint8x16_t vi = vld1q_u8(input + i * 16); + float32x4x4_t vo = { + vmulq_f32(vscale, + vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16( + vget_low_u16(vmovl_u8(vget_low_u8(vi))))), vzero))), + vmulq_f32(vscale, + vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16( + vget_high_u16(vmovl_u8(vget_low_u8(vi))))), vzero))), + vmulq_f32(vscale, + vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16( + vget_low_u16(vmovl_u8(vget_high_u8(vi))))), vzero))), + vmulq_f32(vscale, + vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16( + vget_high_u16(vmovl_u8(vget_high_u8(vi))))), vzero))), + }; + vst1q_f32(output + i * 16, vo.val[0]); + vst1q_f32(output + i * 16 + 4, vo.val[1]); + vst1q_f32(output + i * 16 + 8, vo.val[2]); + vst1q_f32(output + i * 16 + 12, vo.val[3]); + } + }, 0, block_count, 1); + + for (index_t i = block_count * 16; i < size; ++i) { + output[i] = scale * (input[i] - zero_point); + } +} + +template<> +void QuantizeUtil::Dequantize(const int *input, + const index_t size, + const float scale, + const int32_t zero_point, + float *output) { + const index_t block_count = size / 4; + const int32x4_t vzero = vdupq_n_s32(zero_point); + const float32x4_t vscale = vdupq_n_f32(scale); + + thread_pool_->Compute1D([=](index_t start, index_t end, index_t step) { + for (index_t i = start; i < end; i += step) { + int32x4_t vi = vld1q_s32(input + i * 4); + float32x4_t vo = vmulq_f32(vscale, vcvtq_f32_s32(vsubq_s32(vi, vzero))); + 
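+      // out = scale * (q - zero_point): vsubq_s32 removes the zero point,
+      // vcvtq_f32_s32 converts the four int32 lanes to float, and vmulq_f32
+      // applies the scale; the scalar loop after Compute1D covers the
+      // remaining size % 4 elements.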
vst1q_f32(output + i * 4, vo); + } + }, 0, block_count, 1); + + for (index_t i = block_count * 4; i < size; ++i) { + output[i] = scale * (input[i] - zero_point); + } +} +#endif + +} // namespace mace diff --git a/mace/core/quantize.h b/mace/core/quantize.h new file mode 100644 index 0000000000000000000000000000000000000000..3e755bf0e7af22f0424ef5c84f8384699d041d12 --- /dev/null +++ b/mace/core/quantize.h @@ -0,0 +1,232 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_CORE_QUANTIZE_H_ +#define MACE_CORE_QUANTIZE_H_ + +#include +#include +#include + +#include "mace/utils/logging.h" +#include "mace/utils/thread_pool.h" +#include "mace/core/tensor.h" + +namespace mace { + +template +inline void AdjustRange(const float in_min_data, + const float in_max_data, + const bool non_zero, + float *scale, + int32_t *zero_point) { + // re-range to make range include zero float and + // make zero float as integer u8 + const T quantized_min = std::numeric_limits::lowest(); + const T quantized_max = std::numeric_limits::max(); + if (quantized_min < 0) { + MACE_ASSERT(!non_zero, "Cannot nudge to non_zero quantize value."); + } + + float out_max = std::max(0.f, in_max_data); + float out_min = std::min(0.f, in_min_data); + // make in_min_data quantize as greater than 1 + if (non_zero) { + out_min = std::min(out_min, + in_min_data - (out_max - in_min_data) + / (quantized_max - quantized_min - 1)); + } + + *scale = (out_max - out_min) / (quantized_max - quantized_min); + const float kEps = 1e-6; + if (out_min < -kEps && out_max > kEps) { + float quantized_zero = -out_min / *scale; + int32_t + quantized_zero_near_int = static_cast(roundf(quantized_zero)); + *zero_point = quantized_zero_near_int; + if (fabs(quantized_zero - quantized_zero_near_int) > kEps && non_zero) { + *zero_point = static_cast(std::ceil(quantized_zero)); + } + } else if (out_min > -kEps) { + *zero_point = quantized_min; + } else { + *zero_point = quantized_max; + } +} + +template +inline T Saturate(float value) { + int rounded_value = static_cast(value); + if (rounded_value <= std::numeric_limits::lowest()) { + return std::numeric_limits::lowest(); + } else if (rounded_value >= std::numeric_limits::max()) { + return std::numeric_limits::max(); + } else { + return static_cast(rounded_value); + } +} + +inline void FindMinMax(const float *input, + const index_t size, + float *min_val, float *max_val) { + float max_v = std::numeric_limits::lowest(); + float min_v = std::numeric_limits::max(); + for (index_t i = 0; i < size; ++i) { + max_v = std::max(max_v, input[i]); + min_v = std::min(min_v, input[i]); + } + *min_val = min_v; + *max_val = max_v; +} + +inline void QuantizeMultiplier(double multiplier, + int32_t *output_multiplier, + int32_t *shift) { + const double q = std::frexp(multiplier, shift); + auto qint = static_cast(roundl(q * (1ll << 31))); + if (qint == (1ll << 31)) { + qint /= 2; + ++*shift; + } + *output_multiplier = static_cast(qint); + 
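+  // std::frexp splits the multiplier into q * 2^shift with q in [0.5, 1);
+  // rounding q * 2^31 gives a Q31 fixed-point mantissa, so
+  // multiplier ~= output_multiplier * 2^(shift - 31). If rounding carries q
+  // up to exactly 2^31, halving the mantissa and incrementing the shift
+  // keeps the represented value unchanged.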
MACE_CHECK(*output_multiplier <= std::numeric_limits::max()); +} + +inline void GetOutputMultiplierAndShift( + const float lhs_scale, const float rhs_scale, const float output_scale, + int32_t *quantized_multiplier, int *right_shift) { + float real_multiplier = lhs_scale * rhs_scale / output_scale; + MACE_CHECK(real_multiplier > 0.f && real_multiplier < 1.f, real_multiplier); + + int exponent; + QuantizeMultiplier(real_multiplier, quantized_multiplier, &exponent); + *right_shift = -exponent; + MACE_CHECK(*right_shift >= 0); +} + +template +class QuantizeUtil { + public: + explicit QuantizeUtil(utils::ThreadPool *thread_pool) + : thread_pool_(thread_pool) {} + + void QuantizeWithScaleAndZeropoint(const float *input, + const index_t size, + float scale, + int32_t zero_point, + T *output) { + float recip_scale = 1 / scale; + thread_pool_->Compute1D([=](index_t start, index_t end, index_t step) { + for (index_t i = start; i < end; i += step) { + output[i] = Saturate(roundf(zero_point + recip_scale * input[i])); + } + }, 0, size, 1); + } + + void Quantize(const float *input, + const index_t size, + bool non_zero, + T *output, + float *scale, + int32_t *zero_point) { + float in_min_data; + float in_max_data; + FindMinMax(input, size, &in_min_data, &in_max_data); + + AdjustRange(in_min_data, in_max_data, non_zero, + scale, zero_point); + + QuantizeWithScaleAndZeropoint(input, size, *scale, *zero_point, output); + } + + void Quantize(const Tensor &input, + Tensor *output, + float *min_out, + float *max_out) { + MACE_CHECK(input.size() != 0); + Tensor::MappingGuard input_guard(&input); + Tensor::MappingGuard output_guard(output); + auto *input_data = input.data(); + auto *output_data = output->mutable_data(); + float scale; + int32_t zero_point; + + Quantize(input_data, input.size(), false, output_data, &scale, &zero_point); + + *min_out = scale * (std::numeric_limits::lowest() - zero_point); + *max_out = scale * (std::numeric_limits::max() - zero_point); + } + + void Dequantize(const T *input, + const index_t size, + const float scale, + const int32_t zero_point, + float *output) { + thread_pool_->Compute1D([=](index_t start, index_t end, index_t step) { + for (index_t i = start; i < end; i += step) { + output[i] = scale * (input[i] - zero_point); + } + }, 0, size, 1); + } + + void DeQuantize(const Tensor &input, + const float min_in, + const float max_in, + Tensor *output) { + MACE_CHECK(input.size() != 0); + Tensor::MappingGuard input_guard(&input); + Tensor::MappingGuard output_guard(output); + auto *input_data = input.data(); + auto *output_data = output->mutable_data(); + float scale; + int32_t zero_point; + + AdjustRange(min_in, max_in, false, &scale, &zero_point); + + Dequantize(input_data, input.size(), scale, zero_point, output_data); + } + + private: + utils::ThreadPool *thread_pool_; +}; + +#ifdef MACE_ENABLE_NEON + +template<> +void QuantizeUtil::QuantizeWithScaleAndZeropoint( + const float *input, + const index_t size, + float scale, + int32_t zero_point, + uint8_t *output); + +template<> +void QuantizeUtil::Dequantize(const uint8_t *input, + const index_t size, + const float scale, + const int32_t zero_point, + float *output); + +template<> +void QuantizeUtil::Dequantize(const int *input, + const index_t size, + const float scale, + const int32_t zero_point, + float *output); + +#endif + +} // namespace mace + +#endif // MACE_CORE_QUANTIZE_H_ diff --git a/mace/core/runtime/cpu/cpu_runtime.cc b/mace/core/runtime/cpu/cpu_runtime.cc index 
ae3689c2f35cceb13d68e7a91b415dfadeb9fc37..ad60447e706613252affbac36d635a8f88193a71 100644 --- a/mace/core/runtime/cpu/cpu_runtime.cc +++ b/mace/core/runtime/cpu/cpu_runtime.cc @@ -68,7 +68,7 @@ MaceStatus SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads, #else MACE_UNUSED(omp_num_threads); MACE_UNUSED(schedule_policy); - LOG(WARNING) << "Set OpenMP threads number failed: OpenMP not enabled."; + VLOG(2) << "Set OpenMP threads number failed: OpenMP not enabled."; #endif #ifdef MACE_ENABLE_OPENMP @@ -143,7 +143,7 @@ MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy( #ifdef MACE_ENABLE_OPENMP omp_set_num_threads(num_threads_hint); #else - LOG(WARNING) << "Set OpenMP threads number failed: OpenMP not enabled."; + VLOG(2) << "Set OpenMP threads number failed: OpenMP not enabled."; #endif return MaceStatus::MACE_SUCCESS; } diff --git a/mace/core/runtime/cpu/cpu_runtime.h b/mace/core/runtime/cpu/cpu_runtime.h index 08584dd91865b33c23b8cdf42e696b43390b14b9..f8cb2111cedd5fa125a6dae264c2210359900c8d 100644 --- a/mace/core/runtime/cpu/cpu_runtime.h +++ b/mace/core/runtime/cpu/cpu_runtime.h @@ -35,24 +35,17 @@ class CPURuntime { public: CPURuntime(const int num_threads, CPUAffinityPolicy policy, - bool use_gemmlowp) + utils::ThreadPool *thread_pool) : num_threads_(num_threads), policy_(policy), gemm_context_(nullptr), - thread_pool_(static_cast(num_threads), policy) { + thread_pool_(thread_pool) { #ifdef MACE_ENABLE_QUANTIZE - if (use_gemmlowp) { - MACE_CHECK_NOTNULL(GetGemmlowpContext()); - } -#else - MACE_UNUSED(use_gemmlowp); + MACE_CHECK_NOTNULL(GetGemmlowpContext()); #endif // MACE_ENABLE_QUANTIZE SetOpenMPThreadsAndAffinityPolicy(num_threads_, policy_, gemm_context_); - // TODO(liyin): After we replace OpenMP to thread_pool, uncomment the - // following line. 
- // thread_pool_.Init(); } #ifdef MACE_ENABLE_QUANTIZE @@ -80,12 +73,8 @@ class CPURuntime { return policy_; } - bool use_gemmlowp() const { - return gemm_context_ != nullptr; - } - utils::ThreadPool &thread_pool() { - return thread_pool_; + return *thread_pool_; } private: @@ -97,7 +86,7 @@ class CPURuntime { int num_threads_; CPUAffinityPolicy policy_; void *gemm_context_; - utils::ThreadPool thread_pool_; + utils::ThreadPool *thread_pool_; }; } // namespace mace diff --git a/mace/core/runtime/hexagon/hexagon_device.h b/mace/core/runtime/hexagon/hexagon_device.h index f80607d3196582f850d0911fec0429784cabaca0..b17b19e5469cb5bb01e42f9beecdba286d8454af 100644 --- a/mace/core/runtime/hexagon/hexagon_device.h +++ b/mace/core/runtime/hexagon/hexagon_device.h @@ -31,8 +31,9 @@ namespace mace { class HexagonDevice : public CPUDevice { public: - explicit HexagonDevice(DeviceType device_type) - : CPUDevice(0, AFFINITY_NONE, false), + explicit HexagonDevice(DeviceType device_type, + utils::ThreadPool *thread_pool) + : CPUDevice(0, AFFINITY_NONE, thread_pool), device_type_(device_type) {} DeviceType device_type() const override { @@ -44,9 +45,9 @@ class HexagonDevice : public CPUDevice { }; std::unique_ptr CreateHexagonControlWrapper( - DeviceType device_type) { + Device *device) { std::unique_ptr hexagon_controller; - + auto device_type = device->device_type(); switch (device_type) { #ifdef MACE_ENABLE_HEXAGON case HEXAGON: @@ -55,11 +56,10 @@ std::unique_ptr CreateHexagonControlWrapper( #endif #ifdef MACE_ENABLE_HTA case HTA: - hexagon_controller = make_unique(); + hexagon_controller = make_unique(device); break; #endif - default: - LOG(FATAL) << "Not supported Hexagon device type: " << device_type; + default:LOG(FATAL) << "Not supported Hexagon device type: " << device_type; } return hexagon_controller; diff --git a/mace/core/runtime/hexagon/hexagon_dsp_wrapper.cc b/mace/core/runtime/hexagon/hexagon_dsp_wrapper.cc index 0b285ee2bd7171a4f21baddfee31a0f695d48982..a617e7c7f5f534d8bb765529c28524c1807b96ea 100644 --- a/mace/core/runtime/hexagon/hexagon_dsp_wrapper.cc +++ b/mace/core/runtime/hexagon/hexagon_dsp_wrapper.cc @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "mace/core/runtime/hexagon/hexagon_dsp_wrapper.h" + #include #include #include @@ -22,7 +24,6 @@ #include #include -#include "mace/core/runtime/hexagon/hexagon_dsp_wrapper.h" #include "mace/core/runtime/hexagon/hexagon_dsp_ops.h" #include "mace/core/types.h" #include "mace/port/env.h" diff --git a/mace/core/runtime/hexagon/hexagon_hta_wrapper.cc b/mace/core/runtime/hexagon/hexagon_hta_wrapper.cc index c4191e7f25dff4f55d6cec283df7a6b0d733b94b..06dadc3a9ae986cf8da9d2da8e4e212edf3d93cd 100644 --- a/mace/core/runtime/hexagon/hexagon_hta_wrapper.cc +++ b/mace/core/runtime/hexagon/hexagon_hta_wrapper.cc @@ -26,11 +26,15 @@ #include "mace/core/runtime/hexagon/hexagon_hta_ops.h" #include "mace/core/types.h" #include "mace/utils/memory.h" -#include "mace/utils/quantize.h" +#include "mace/core/quantize.h" #include "third_party/hta/hta_hexagon_api.h" namespace mace { +HexagonHTAWrapper::HexagonHTAWrapper(Device *device) + : device_(device), quantize_util_(&device->cpu_runtime()->thread_pool()) { +} + int HexagonHTAWrapper::GetVersion() { int version; MACE_CHECK(hexagon_hta_nn_version(&version) == 0, "get version error"); @@ -237,8 +241,8 @@ bool HexagonHTAWrapper::ExecuteGraph(const Tensor &input_tensor, } bool HexagonHTAWrapper::ExecuteGraphNew( - const std::map &input_tensors, - std::map *output_tensors) { + const std::map &input_tensors, + std::map *output_tensors) { VLOG(2) << "Execute graph new: " << nn_id_; uint32_t num_inputs = static_cast(input_tensors.size()); uint32_t num_outputs = static_cast(output_tensors->size()); @@ -261,11 +265,11 @@ bool HexagonHTAWrapper::ExecuteGraphNew( const float *input_data = input_tensor->data(); uint8_t *input_data_u8 = input_info_[i].tensor_u8->mutable_data(); - QuantizeWithScaleAndZeropoint(input_data, - input_tensor->size(), - input_info_[i].scale, - input_info_[i].zero_point, - input_data_u8); + quantize_util_.QuantizeWithScaleAndZeropoint(input_data, + input_tensor->size(), + input_info_[i].scale, + input_info_[i].zero_point, + input_data_u8); inputs[i].data = const_cast( reinterpret_cast( @@ -315,11 +319,11 @@ bool HexagonHTAWrapper::ExecuteGraphNew( const uint8_t *output_data_u8 = output_info_[i].tensor_u8->data(); float *output_data = output_tensor->mutable_data(); - Dequantize(output_data_u8, - output_info_[i].tensor_u8->size(), - output_info_[i].scale, - output_info_[i].zero_point, - output_data); + quantize_util_.Dequantize(output_data_u8, + output_info_[i].tensor_u8->size(), + output_info_[i].scale, + output_info_[i].zero_point, + output_data); } return res == 0; diff --git a/mace/core/runtime/hexagon/hexagon_hta_wrapper.h b/mace/core/runtime/hexagon/hexagon_hta_wrapper.h index 66d02e0290c82f7bfcadf17cdba94db8e035db94..af8294b1c2993111ba9f5d31986d6c8346a765a8 100644 --- a/mace/core/runtime/hexagon/hexagon_hta_wrapper.h +++ b/mace/core/runtime/hexagon/hexagon_hta_wrapper.h @@ -19,15 +19,18 @@ #include #include +#include "mace/utils/thread_pool.h" +#include "mace/core/quantize.h" #include "mace/core/runtime/hexagon/hexagon_control_wrapper.h" #include "mace/core/tensor.h" +#include "mace/core/device.h" #include "mace/public/mace.h" namespace mace { class HexagonHTAWrapper : public HexagonControlWrapper { public: - HexagonHTAWrapper() = default; + explicit HexagonHTAWrapper(Device *device); int GetVersion() override; bool Config() override; @@ -46,6 +49,9 @@ class HexagonHTAWrapper : public HexagonControlWrapper { void ResetPerfInfo() override; void SetDebugLevel(int level) override; + private: + Device *device_; + QuantizeUtil 
quantize_util_; MACE_DISABLE_COPY_AND_ASSIGN(HexagonHTAWrapper); }; } // namespace mace diff --git a/mace/core/runtime/opencl/gpu_device.cc b/mace/core/runtime/opencl/gpu_device.cc index 2bdf6802af34983fa1d0b1c3ae8527b46f762152..a4d3f8b268679f89bce320bb08b84330a02cdbdb 100644 --- a/mace/core/runtime/opencl/gpu_device.cc +++ b/mace/core/runtime/opencl/gpu_device.cc @@ -25,8 +25,10 @@ GPUDevice::GPUDevice(std::shared_ptr> tuner, std::shared_ptr opencl_binary_storage, const int num_threads, CPUAffinityPolicy cpu_affinity_policy, - bool use_gemmlowp) : - CPUDevice(num_threads, cpu_affinity_policy, use_gemmlowp), + utils::ThreadPool *thread_pool) : + CPUDevice(num_threads, + cpu_affinity_policy, + thread_pool), runtime_(new OpenCLRuntime(opencl_cache_storage, priority, perf, opencl_binary_storage, tuner)), allocator_(new OpenCLAllocator(runtime_.get())), @@ -35,7 +37,7 @@ GPUDevice::GPUDevice(std::shared_ptr> tuner, GPUDevice::~GPUDevice() = default; -GPURuntime* GPUDevice::gpu_runtime() { +GPURuntime *GPUDevice::gpu_runtime() { return gpu_runtime_.get(); } diff --git a/mace/core/runtime/opencl/gpu_device.h b/mace/core/runtime/opencl/gpu_device.h index 768ea378b5bf3dd2128b2cceb97cfca69e0f0323..ef2ceb5a46e943a337b713b2d6c1b7ee846153e5 100644 --- a/mace/core/runtime/opencl/gpu_device.h +++ b/mace/core/runtime/opencl/gpu_device.h @@ -33,7 +33,7 @@ class GPUDevice : public CPUDevice { std::shared_ptr opencl_binary_storage = nullptr, const int num_threads = -1, CPUAffinityPolicy cpu_affinity_policy = AFFINITY_NONE, - bool use_gemmlowp = false); + utils::ThreadPool *thread_pool = nullptr); ~GPUDevice(); GPURuntime *gpu_runtime() override; Allocator *allocator() override; diff --git a/mace/core/testing/test_benchmark.h b/mace/core/testing/test_benchmark.h index 89494eabc4effeca33896ea1fd411acb640a35f4..1082510ac33e067c5ef27c0bc45cce7a9b978540 100644 --- a/mace/core/testing/test_benchmark.h +++ b/mace/core/testing/test_benchmark.h @@ -20,6 +20,8 @@ #include #include +#include "mace/core/types.h" + #define MACE_BENCHMARK(n) \ static ::mace::testing::Benchmark *__benchmark_##n = \ (new ::mace::testing::Benchmark(#n, (n))) diff --git a/mace/core/testing/test_benchmark_main.cc b/mace/core/testing/test_benchmark_main.cc index da78d3ffc1646e2187f05db04d16dc16c96a8acf..9da650f8192f5e384a2367098938deff19902d81 100644 --- a/mace/core/testing/test_benchmark_main.cc +++ b/mace/core/testing/test_benchmark_main.cc @@ -33,8 +33,7 @@ int main(int argc, char **argv) { // config runtime mace::ops::test::OpTestContext::Get( FLAGS_omp_num_threads, - static_cast(FLAGS_cpu_affinity_policy), - true); + static_cast(FLAGS_cpu_affinity_policy)); mace::testing::Benchmark::Run(FLAGS_filter.c_str()); return 0; diff --git a/mace/core/types.h b/mace/core/types.h index 4ac00a54f736ace49e50219b81f32a53996272c4..8dde57fd48d4bfd29405b28bfdcbc05a67d0c897 100644 --- a/mace/core/types.h +++ b/mace/core/types.h @@ -54,6 +54,12 @@ MACE_MAPPING_DATA_TYPE_AND_ENUM(half, DT_HALF); MACE_MAPPING_DATA_TYPE_AND_ENUM(float, DT_FLOAT); MACE_MAPPING_DATA_TYPE_AND_ENUM(uint8_t, DT_UINT8); MACE_MAPPING_DATA_TYPE_AND_ENUM(int32_t, DT_INT32); + +enum FrameworkType { + TENSORFLOW = 0, + CAFFE = 1, +}; + } // namespace mace #endif // MACE_CORE_TYPES_H_ diff --git a/mace/core/workspace.cc b/mace/core/workspace.cc index 8009fda180a7d186ec9e27b0c0751cd34eeb0a11..7cb97fe77cb1a7f4ee6e2e1cf41aaa0d2062070e 100644 --- a/mace/core/workspace.cc +++ b/mace/core/workspace.cc @@ -19,7 +19,7 @@ #include "mace/core/arg_helper.h" #include "mace/core/memory_optimizer.h" 
-#include "mace/utils/quantize.h" +#include "mace/core/quantize.h" #ifdef MACE_ENABLE_OPENCL #include "mace/core/runtime/opencl/opencl_runtime.h" @@ -95,8 +95,8 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def, model_data_size = std::max( model_data_size, static_cast(const_tensor.offset() + - const_tensor.data_size() * - GetEnumTypeSize(const_tensor.data_type()))); + const_tensor.data_size() * + GetEnumTypeSize(const_tensor.data_type()))); } VLOG(3) << "Model data size: " << model_data_size; @@ -163,11 +163,13 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def, auto quantized_data = reinterpret_cast( model_data + const_tensor.offset()); auto dequantized_data = tensor->mutable_data(); - Dequantize(quantized_data, - tensor->size(), - const_tensor.scale(), - const_tensor.zero_point(), - dequantized_data); + QuantizeUtil + quantize_util(&device->cpu_runtime()->thread_pool()); + quantize_util.Dequantize(quantized_data, + tensor->size(), + const_tensor.scale(), + const_tensor.zero_point(), + dequantized_data); } else { tensor->CopyBytes(model_data + const_tensor.offset(), const_tensor.data_size() * @@ -185,14 +187,14 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def, if (device_type == DeviceType::CPU) { tensor_buffer_ = std::unique_ptr( new Buffer(device->allocator(), - const_cast(model_data), + const_cast(model_data), model_data_size)); } else { tensor_buffer_ = std::unique_ptr( new Buffer(device->allocator())); MACE_RETURN_IF_ERROR(tensor_buffer_->Allocate(model_data_size)); tensor_buffer_->Map(nullptr); - tensor_buffer_->Copy(const_cast(model_data), + tensor_buffer_->Copy(const_cast(model_data), 0, model_data_size); tensor_buffer_->UnMap(); } diff --git a/mace/examples/android/macelibrary/src/main/cpp/image_classify.cc b/mace/examples/android/macelibrary/src/main/cpp/image_classify.cc index 442e3a1a25f5d22bc8198e0c5b6d87894738f4e8..f3aac339dc957f33432e7036e86c19a2951bfbfa 100755 --- a/mace/examples/android/macelibrary/src/main/cpp/image_classify.cc +++ b/mace/examples/android/macelibrary/src/main/cpp/image_classify.cc @@ -112,8 +112,7 @@ Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateEngine( mace::MaceEngineConfig config(mace_context.device_type); status = config.SetCPUThreadPolicy( omp_num_threads, - static_cast(cpu_affinity_policy), - true); + static_cast(cpu_affinity_policy)); if (status != mace::MaceStatus::MACE_SUCCESS) { __android_log_print(ANDROID_LOG_ERROR, "image_classify attrs", diff --git a/mace/examples/cli/BUILD.bazel b/mace/examples/cli/BUILD.bazel index efd4454dafa4fa6d790908b6234822532b0c4098..edd9170d56399309c71b183f44e21ebffe7b3c6d 100644 --- a/mace/examples/cli/BUILD.bazel +++ b/mace/examples/cli/BUILD.bazel @@ -5,6 +5,7 @@ load( "if_darwin", "if_hexagon_enabled", "if_hta_enabled", + "if_linux", "if_opencl_enabled", "if_openmp_enabled", ) @@ -21,13 +22,12 @@ cc_binary( linkopts = [ "-lm", "-ldl", - ] + if_darwin( - [], + ] + if_linux(["-lpthread"]) + if_darwin( + ["-lpthread"], default_value = ["-fuse-ld=gold"], ) + if_openmp_enabled([ "-fopenmp", ]) + if_android([ - "-ldl", "-pie", "-llog", ]), @@ -60,11 +60,10 @@ cc_binary( linkopts = [ "-lm", "-ldl", - ] + if_darwin( - [], + ] + if_linux(["-lpthread"]) + if_darwin( + ["-lpthread"], default_value = ["-fuse-ld=gold"], ) + if_android([ - "-ldl", "-pie", "-llog", ]), diff --git a/mace/libmace/mace.cc b/mace/libmace/mace.cc index da43f5e27dafdecf90c3e18ce83bb66d42165343..c5e16b762a57e6eddcebc269d7f369ffabac28dd 100644 --- a/mace/libmace/mace.cc +++ b/mace/libmace/mace.cc @@ -149,7 
+149,7 @@ GPUContextBuilder &GPUContextBuilder::SetOpenCLBinaryPaths( return *this; } -GPUContextBuilder& GPUContextBuilder::SetOpenCLBinary( +GPUContextBuilder &GPUContextBuilder::SetOpenCLBinary( const unsigned char *data, const size_t size) { impl_->SetOpenCLBinary(data, size); return *this; @@ -161,7 +161,7 @@ GPUContextBuilder &GPUContextBuilder::SetOpenCLParameterPath( return *this; } -GPUContextBuilder& GPUContextBuilder::SetOpenCLParameter( +GPUContextBuilder &GPUContextBuilder::SetOpenCLParameter( const unsigned char *data, const size_t size) { impl_->SetOpenCLParameter(data, size); return *this; @@ -181,8 +181,7 @@ class MaceEngineConfig::Impl { MaceStatus SetGPUHints(GPUPerfHint perf_hint, GPUPriorityHint priority_hint); MaceStatus SetCPUThreadPolicy(int num_threads_hint, - CPUAffinityPolicy policy, - bool use_gemmlowp); + CPUAffinityPolicy policy); inline DeviceType device_type() const { return device_type_; @@ -196,10 +195,6 @@ class MaceEngineConfig::Impl { return cpu_affinity_policy_; } - inline bool use_gemmlowp() const { - return use_gemmlowp_; - } - inline std::shared_ptr gpu_context() const { return gpu_context_; } @@ -216,7 +211,6 @@ class MaceEngineConfig::Impl { DeviceType device_type_; int num_threads_; CPUAffinityPolicy cpu_affinity_policy_; - bool use_gemmlowp_; std::shared_ptr gpu_context_; GPUPriorityHint gpu_priority_hint_; GPUPerfHint gpu_perf_hint_; @@ -226,7 +220,6 @@ MaceEngineConfig::Impl::Impl(const DeviceType device_type) : device_type_(device_type), num_threads_(-1), cpu_affinity_policy_(CPUAffinityPolicy::AFFINITY_NONE), - use_gemmlowp_(false), gpu_context_(new GPUContext), gpu_priority_hint_(GPUPriorityHint::PRIORITY_LOW), gpu_perf_hint_(GPUPerfHint::PERF_NORMAL) {} @@ -247,15 +240,12 @@ MaceStatus MaceEngineConfig::Impl::SetGPUHints( MaceStatus MaceEngineConfig::Impl::SetCPUThreadPolicy( int num_threads, - CPUAffinityPolicy policy, - bool use_gemmlowp) { + CPUAffinityPolicy policy) { num_threads_ = num_threads; cpu_affinity_policy_ = policy; - use_gemmlowp_ = use_gemmlowp; return MaceStatus::MACE_SUCCESS; } - MaceEngineConfig::MaceEngineConfig( const DeviceType device_type) : impl_(new MaceEngineConfig::Impl(device_type)) {} @@ -275,9 +265,8 @@ MaceStatus MaceEngineConfig::SetGPUHints( MaceStatus MaceEngineConfig::SetCPUThreadPolicy( int num_threads_hint, - CPUAffinityPolicy policy, - bool use_gemmlowp) { - return impl_->SetCPUThreadPolicy(num_threads_hint, policy, use_gemmlowp); + CPUAffinityPolicy policy) { + return impl_->SetCPUThreadPolicy(num_threads_hint, policy); } // Mace Tensor @@ -407,6 +396,7 @@ class MaceEngine::Impl { #endif std::map input_info_map_; std::map output_info_map_; + std::unique_ptr thread_pool_; MACE_DISABLE_COPY_AND_ASSIGN(Impl); }; @@ -418,16 +408,19 @@ MaceEngine::Impl::Impl(const MaceEngineConfig &config) device_(nullptr), ws_(new Workspace()), net_(nullptr), - is_quantized_model_(false) + is_quantized_model_(false), + thread_pool_(new utils::ThreadPool(config.impl_->num_threads(), + config.impl_->cpu_affinity_policy())) #if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA) - , hexagon_controller_(nullptr) +, hexagon_controller_(nullptr) #endif { LOG(INFO) << "Creating MaceEngine, MACE version: " << MaceVersion(); + thread_pool_->Init(); if (device_type_ == DeviceType::CPU) { device_.reset(new CPUDevice(config.impl_->num_threads(), config.impl_->cpu_affinity_policy(), - config.impl_->use_gemmlowp())); + thread_pool_.get())); } #ifdef MACE_ENABLE_OPENCL if (device_type_ == DeviceType::GPU) { @@ -439,12 +432,13 @@ 
MaceEngine::Impl::Impl(const MaceEngineConfig &config) config.impl_->gpu_context()->opencl_binary_storage(), config.impl_->num_threads(), config.impl_->cpu_affinity_policy(), - config.impl_->use_gemmlowp())); + thread_pool_.get())); } #endif #if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA) - if (device_type_ == DeviceType::HEXAGON || device_type_ == DeviceType::HTA) { - device_.reset(new HexagonDevice(device_type_)); + if (device_type_ == DeviceType::HEXAGON + || device_type_ == DeviceType::HTA) { + device_.reset(new HexagonDevice(device_type_, thread_pool_.get())); } #endif MACE_CHECK_NOTNULL(device_); @@ -506,7 +500,7 @@ MaceStatus MaceEngine::Impl::Init( } #if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA) if (device_type_ == HEXAGON || device_type_ == HTA) { - hexagon_controller_ = CreateHexagonControlWrapper(device_type_); + hexagon_controller_ = CreateHexagonControlWrapper(device_.get()); MACE_CHECK(hexagon_controller_->Config(), "hexagon config error"); MACE_CHECK(hexagon_controller_->Init(), "hexagon init error"); hexagon_controller_->SetDebugLevel( @@ -518,26 +512,26 @@ MaceStatus MaceEngine::Impl::Init( } } else { #endif - MACE_RETURN_IF_ERROR(ws_->LoadModelTensor(*net_def, - device_.get(), - model_data)); - - MemoryOptimizer mem_optimizer; - // Init model - net_ = std::unique_ptr(new SerialNet(op_registry_.get(), - net_def, - ws_.get(), - device_.get(), - &mem_optimizer)); - - // Preallocate all output tensors of ops - MACE_RETURN_IF_ERROR(ws_->PreallocateOutputTensor(*net_def, - &mem_optimizer, - device_.get())); - if (device_type_ == DeviceType::GPU) { - ws_->RemoveAndReloadBuffer(*net_def, model_data, device_->allocator()); - } - MACE_RETURN_IF_ERROR(net_->Init()); + MACE_RETURN_IF_ERROR(ws_->LoadModelTensor(*net_def, + device_.get(), + model_data)); + + MemoryOptimizer mem_optimizer; + // Init model + net_ = std::unique_ptr(new SerialNet(op_registry_.get(), + net_def, + ws_.get(), + device_.get(), + &mem_optimizer)); + + // Preallocate all output tensors of ops + MACE_RETURN_IF_ERROR(ws_->PreallocateOutputTensor(*net_def, + &mem_optimizer, + device_.get())); + if (device_type_ == DeviceType::GPU) { + ws_->RemoveAndReloadBuffer(*net_def, model_data, device_->allocator()); + } + MACE_RETURN_IF_ERROR(net_->Init()); #if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA) } #endif @@ -554,10 +548,10 @@ MaceStatus MaceEngine::Impl::Init( auto fs = GetFileSystem(); MACE_RETURN_IF_ERROR(fs->NewReadOnlyMemoryRegionFromFile( - model_data_file.c_str(), &model_data_)); + model_data_file.c_str(), &model_data_)); MACE_RETURN_IF_ERROR(Init(net_def, input_nodes, output_nodes, - reinterpret_cast(model_data_->data()))); + reinterpret_cast(model_data_->data()))); if (device_type_ == DeviceType::GPU || device_type_ == DeviceType::HEXAGON || device_type_ == DeviceType::HTA || @@ -611,18 +605,18 @@ MaceStatus MaceEngine::Impl::TransposeInput( Tensor::MappingGuard input_guard(input_tensor); if (input_dt == DataType::DT_FLOAT) { auto input_data = input_tensor->mutable_data(); - return ops::Transpose(input.second.data().get(), + return ops::Transpose(thread_pool_.get(), + input.second.data().get(), input.second.shape(), dst_dims, - input_data, - input_dt); + input_data); } else if (input_dt == DataType::DT_INT32) { auto input_data = input_tensor->mutable_data(); - return ops::Transpose(input.second.data().get(), + return ops::Transpose(thread_pool_.get(), + input.second.data().get(), input.second.shape(), dst_dims, - input_data, - input_dt); + input_data); } else { 
LOG(FATAL) << "MACE do not support the input data type: " << input_dt; } @@ -668,7 +662,7 @@ MaceStatus MaceEngine::Impl::TransposeOutput( output->second.data_format() == NCHW) { dst_dims = {0, 3, 1, 2}; } else { - LOG(FATAL) <<"Not supported output data format: " + LOG(FATAL) << "Not supported output data format: " << output->second.data_format() << " vs " << output_tensor->data_format(); } @@ -688,17 +682,18 @@ MaceStatus MaceEngine::Impl::TransposeOutput( Tensor::MappingGuard output_guard(output_tensor); if (output_dt == DataType::DT_FLOAT) { auto output_data = output_tensor->data(); - return ops::Transpose(output_data, + return ops::Transpose(thread_pool_.get(), + output_data, output_tensor->shape(), dst_dims, output->second.data().get()); } else if (output_dt == DataType::DT_INT32) { auto output_data = output_tensor->data(); - return ops::Transpose(output_data, + return ops::Transpose(thread_pool_.get(), + output_data, output_tensor->shape(), dst_dims, - output->second.data().get(), - output_dt); + output->second.data().get()); } else { LOG(FATAL) << "MACE do not support the output data type: " << output_dt; return MaceStatus::MACE_INVALID_ARGS; @@ -719,8 +714,8 @@ MaceStatus MaceEngine::Impl::TransposeOutput( output_size * sizeof(float)); } else if (output_dt == DataType::DT_INT32) { std::memcpy(output->second.data().get(), - output_tensor->data(), - output_size * sizeof(int)); + output_tensor->data(), + output_size * sizeof(int)); } else { LOG(FATAL) << "MACE do not support the output data type: " << output_dt; } @@ -736,8 +731,8 @@ MaceStatus MaceEngine::Impl::Run( std::map *outputs, RunMetadata *run_metadata) { MACE_CHECK_NOTNULL(outputs); - std::map input_tensors; - std::map output_tensors; + std::map input_tensors; + std::map output_tensors; for (auto &input : inputs) { if (input_info_map_.find(input.first) == input_info_map_.end()) { LOG(FATAL) << "'" << input.first @@ -766,7 +761,7 @@ MaceStatus MaceEngine::Impl::Run( hexagon_controller_->ExecuteGraphNew(input_tensors, &output_tensors); } else { #endif - MACE_RETURN_IF_ERROR(net_->Run(run_metadata)); + MACE_RETURN_IF_ERROR(net_->Run(run_metadata)); #if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA) } #endif @@ -785,7 +780,7 @@ MaceStatus MaceEngine::Impl::Run( return MaceStatus::MACE_SUCCESS; } -MaceEngine::MaceEngine(const MaceEngineConfig &config): +MaceEngine::MaceEngine(const MaceEngineConfig &config) : impl_(make_unique(config)) {} MaceEngine::~MaceEngine() = default; @@ -797,7 +792,6 @@ MaceStatus MaceEngine::Init(const NetDef *net_def, return impl_->Init(net_def, input_nodes, output_nodes, model_data); } - MaceStatus MaceEngine::Init(const NetDef *net_def, const std::vector &input_nodes, const std::vector &output_nodes, diff --git a/mace/ops/BUILD.bazel b/mace/ops/BUILD.bazel index 255250fd945e388981fb46f7fa5443f624059227..5d2d2cb26668c6ac304c38fbbe14c8e95da96303 100644 --- a/mace/ops/BUILD.bazel +++ b/mace/ops/BUILD.bazel @@ -279,7 +279,6 @@ cc_library( srcs = glob( [ "*.cc", - "arm/*.cc", # remove it after refactor ], exclude = [ "*_test.cc", @@ -303,7 +302,6 @@ cc_library( hdrs = glob( [ "*.h", - "arm/*.h", # remove it after refactor ], exclude = [ "ops_registry.h", diff --git a/mace/ops/activation.cc b/mace/ops/activation.cc index 29fee227df0ebac83d9a2e8c9a275a62aff8c68a..bcdcd8e062b21c91b3a44bf8dd999237a385f3c6 100644 --- a/mace/ops/activation.cc +++ b/mace/ops/activation.cc @@ -15,9 +15,14 @@ #include "mace/ops/activation.h" #include - #include "mace/core/operator.h" +#if defined(MACE_ENABLE_NEON) 
+#include "mace/ops/arm/fp32/activation.h" +#else +#include "mace/ops/ref/activation.h" +#endif + #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/buffer_transformer.h" #include "mace/ops/opencl/image/activation.h" @@ -27,52 +32,54 @@ namespace mace { namespace ops { -template +template class ActivationOp; -template <> +template<> class ActivationOp : public Operation { public: explicit ActivationOp(OpConstructContext *context) : Operation(context), - activation_(ops::StringToActivationType( + activation_type_(ops::StringToActivationType( Operation::GetOptionalArg("activation", "NOOP"))), - relux_max_limit_(Operation::GetOptionalArg("max_limit", - 0.0f)), - leakyrelu_coefficient_(Operation::GetOptionalArg( - "leakyrelu_coefficient", 0.0f)) {} + activation_delegator_(activation_type_, + Operation::GetOptionalArg("max_limit", + 0.0f), + Operation::GetOptionalArg( + "leakyrelu_coefficient", 0.0f)) {} MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); const Tensor *input = this->Input(0); Tensor *output = this->Output(0); - MACE_RETURN_IF_ERROR(output->ResizeLike(input)); - const float *input_ptr = input->data(); - float *output_ptr = output->mutable_data(); - if (activation_ == PRELU) { + if (activation_type_ == PRELU) { + MACE_RETURN_IF_ERROR(output->ResizeLike(input)); + const float *input_ptr = input->data(); + float *output_ptr = output->mutable_data(); MACE_CHECK(this->InputSize() > 1); const Tensor *alpha = this->Input(1); const float *alpha_ptr = alpha->data(); const index_t outer_size = output->dim(0); const index_t inner_size = output->dim(2) * output->dim(3); - PReLUActivation(input_ptr, outer_size, input->dim(1), inner_size, + PReLUActivation(context, input_ptr, outer_size, input->dim(1), inner_size, alpha_ptr, output_ptr); } else { - DoActivation(input_ptr, output_ptr, output->size(), activation_, - relux_max_limit_, leakyrelu_coefficient_); + activation_delegator_.Compute(context, input, output); } return MaceStatus::MACE_SUCCESS; } private: - ActivationType activation_; - float relux_max_limit_; - float leakyrelu_coefficient_; + ActivationType activation_type_; +#if defined(MACE_ENABLE_NEON) + arm::fp32::Activation activation_delegator_; +#else + ref::Activation activation_delegator_; +#endif // MACE_ENABLE_NEON }; - #ifdef MACE_ENABLE_OPENCL template class ActivationOp : public Operation { @@ -114,7 +121,6 @@ class ActivationOp : public Operation { }; #endif // MACE_ENABLE_OPENCL - void RegisterActivation(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "Activation", ActivationOp, DeviceType::CPU, float); diff --git a/mace/ops/activation.h b/mace/ops/activation.h index 9981652c78d4290289fc2ce8392adc6550fe267c..9ceae6e07ff983e5c577406d60b6616c56da4fc3 100644 --- a/mace/ops/activation.h +++ b/mace/ops/activation.h @@ -20,8 +20,8 @@ #include #include "mace/core/types.h" +#include "mace/core/op_context.h" #include "mace/ops/common/activation_type.h" -#include "mace/ops/arm/activation_neon.h" #include "mace/utils/logging.h" namespace mace { @@ -41,118 +41,39 @@ inline ActivationType StringToActivationType(const std::string type) { } else if (type == "NOOP") { return ActivationType::NOOP; } else if (type == "LEAKYRELU") { - return ActivationType ::LEAKYRELU; + return ActivationType::LEAKYRELU; } else { LOG(FATAL) << "Unknown activation type: " << type; } return ActivationType::NOOP; } -template -void DoActivation(const T *input_ptr, - T *output_ptr, - const index_t size, - const ActivationType type, - const float relux_max_limit, - const float 
leakyrelu_coefficient) { - MACE_CHECK(DataTypeToEnum::value != DataType::DT_HALF); - - switch (type) { - case NOOP: - break; - case RELU: -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output_ptr[i] = std::max(input_ptr[i], static_cast(0)); - } - break; - case RELUX: -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output_ptr[i] = std::min(std::max(input_ptr[i], static_cast(0)), - static_cast(relux_max_limit)); - } - break; - case TANH: -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output_ptr[i] = std::tanh(input_ptr[i]); - } - break; - case SIGMOID: -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output_ptr[i] = 1 / (1 + std::exp(-input_ptr[i])); - } - break; - case LEAKYRELU: -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output_ptr[i] = std::max(input_ptr[i], static_cast(0)) - + leakyrelu_coefficient * std::min(input_ptr[i], static_cast(0)); - } - break; - default: - LOG(FATAL) << "Unknown activation type: " << type; - } -} - -template<> -inline void DoActivation(const float *input_ptr, - float *output_ptr, - const index_t size, - const ActivationType type, - const float relux_max_limit, - const float leakyrelu_coefficient) { - switch (type) { - case NOOP: - break; - case RELU: - ReluNeon(input_ptr, size, output_ptr); - break; - case RELUX: - ReluxNeon(input_ptr, relux_max_limit, size, output_ptr); - break; - case TANH: -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output_ptr[i] = std::tanh(input_ptr[i]); - } - break; - case SIGMOID: -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output_ptr[i] = 1 / (1 + std::exp(-input_ptr[i])); - } - break; - case LEAKYRELU: - LeakyReluNeon(input_ptr, leakyrelu_coefficient, size, output_ptr); - break; - default: - LOG(FATAL) << "Unknown activation type: " << type; - } -} - -template -void PReLUActivation(const T *input_ptr, +template +void PReLUActivation(const OpContext *context, + const T *input_ptr, const index_t outer_size, const index_t input_chan, const index_t inner_size, const T *alpha_ptr, T *output_ptr) { -#pragma omp parallel for collapse(3) schedule(runtime) - for (index_t i = 0; i < outer_size; ++i) { - for (index_t chan_idx = 0; chan_idx < input_chan; ++chan_idx) { - for (index_t j = 0; j < inner_size; ++j) { - index_t idx = i * input_chan * inner_size + chan_idx * inner_size + j; - if (input_ptr[idx] < 0) { - output_ptr[idx] = input_ptr[idx] * alpha_ptr[chan_idx]; - } else { - output_ptr[idx] = input_ptr[idx]; + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t i = start0; i < end0; i += step0) { + for (index_t chan_idx = start1; chan_idx < end1; chan_idx += step1) { + for (index_t j = 0; j < inner_size; ++j) { + index_t idx = i * input_chan * inner_size + chan_idx * inner_size + j; + if (input_ptr[idx] < 0) { + output_ptr[idx] = input_ptr[idx] * alpha_ptr[chan_idx]; + } else { + output_ptr[idx] = input_ptr[idx]; + } } } } - } + }, 0, outer_size, 1, 0, input_chan, 1); } } // namespace ops diff --git a/mace/ops/addn.cc b/mace/ops/addn.cc index 5e387d87684d833eb40c5ebe30e564ef74bb55cd..ea6458d475751a064cacb118cef64ef498a29e48 100644 --- a/mace/ops/addn.cc +++ b/mace/ops/addn.cc @@ -42,61 +42,23 @@ class 
AddNOp : public Operation { MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); - Tensor *output_tensor = this->Output(0); - size_t input_size = this->inputs_.size(); - MACE_RETURN_IF_ERROR(output_tensor->ResizeLike(inputs_[0])); - index_t size = output_tensor->size(); - Tensor::MappingGuard output_map(output_tensor); - float *output_data = output_tensor->mutable_data(); - memset(output_data, 0, size * sizeof(float)); - int64_t cost = size * input_size; - int64_t groups = 1; - if (cost > kCostPerGroup) { - groups = cost / kCostPerGroup; - } - int64_t element_per_group = size / groups; + Tensor *output = this->Output(0); + MACE_RETURN_IF_ERROR(output->ResizeLike(inputs_[0])); + const index_t size = output->size(); - std::vector mappers; - for (size_t i = 0; i < input_size; ++i) { - MACE_CHECK(inputs_[0]->dim_size() == inputs_[i]->dim_size()); - MACE_CHECK(inputs_[0]->size() == inputs_[i]->size()) - << "Input 0: " << MakeString(inputs_[0]->shape()) - << ", size: " << inputs_[0]->size() << ". Input " << i << ": " - << MakeString(inputs_[i]->shape()) << ", size: " << inputs_[i]->size(); - mappers.emplace_back(Tensor::MappingGuard(inputs_[i])); - } + Tensor::MappingGuard output_guard(output); + auto output_data = output->mutable_data(); + memset(output_data, 0, size * sizeof(float)); -#pragma omp parallel for - for (int64_t i = 0; i < size; i += element_per_group) { - int64_t count = std::min(element_per_group, size - i); - int nn = count >> 2; - int remain = count - (nn << 2); - for (size_t j = 0; j < input_size; ++j) { - const float *input_data = inputs_[j]->data(); - const float *input_ptr = input_data + i; - float *output_ptr = output_data + i; - for (int k = 0; k < nn; ++k) { -#if defined(MACE_ENABLE_NEON) && defined(__aarch64__) - float32x4_t in = vld1q_f32(input_ptr); - float32x4_t out = vld1q_f32(output_ptr); - out = vaddq_f32(out, in); - vst1q_f32(output_ptr, out); -#else - for (int m = 0; m < 4; ++m) { - output_ptr[m] += input_ptr[m]; - } -#endif + for (auto &input : inputs_) { + Tensor::MappingGuard input_guard(input); + auto input_data = input->data(); - input_ptr += 4; - output_ptr += 4; - } - for (int k = 0; k < remain; ++k) { - *output_ptr += *input_ptr; - ++input_ptr; - ++output_ptr; - } + for (index_t j = 0; j < size; ++j) { + output_data[j] += input_data[j]; } } + return MaceStatus::MACE_SUCCESS; } }; diff --git a/mace/ops/argmax.cc b/mace/ops/argmax.cc index 71a1090205b15e38d4357fa78ded3883ef9ea536..32007d6ccbcd59cd78670ad7f46aced4a3e6fa4c 100644 --- a/mace/ops/argmax.cc +++ b/mace/ops/argmax.cc @@ -71,7 +71,6 @@ class ArgMaxOp : public Operation { index_t inner_size = input->dim(axis_value); if (argmin_) { -#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < outer_size; ++i) { int idx = 0; T min_value = std::numeric_limits::max(); @@ -85,7 +84,6 @@ class ArgMaxOp : public Operation { output_data[i] = idx; } } else { -#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < outer_size; ++i) { int idx = 0; T max_value = std::numeric_limits::lowest(); diff --git a/mace/ops/arm/activation_neon.cc b/mace/ops/arm/activation_neon.cc deleted file mode 100644 index 09cfd8d4e0e0bd7ba09bf5f7e31c1bb57afa818b..0000000000000000000000000000000000000000 --- a/mace/ops/arm/activation_neon.cc +++ /dev/null @@ -1,97 +0,0 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#if defined(MACE_ENABLE_NEON) -#include -#endif - -#include -#include "mace/ops/arm/activation_neon.h" - -namespace mace { -namespace ops { - -void ReluNeon(const float *input, const index_t size, float *output) { -#if defined(MACE_ENABLE_NEON) - float32x4_t vzero = vdupq_n_f32(0.f); -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i <= size - 4; i += 4) { - float32x4_t v = vld1q_f32(input + i); - v = vmaxq_f32(v, vzero); - vst1q_f32(output + i, v); - } - // remain - for (index_t i = (size >> 2) << 2; i < size; ++i) { - output[i] = std::max(input[i], 0.f); - } -#else -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = std::max(input[i], 0.f); - } -#endif -} - -void ReluxNeon(const float *input, const float limit, - const index_t size, float *output) { -#if defined(MACE_ENABLE_NEON) - float32x4_t vzero = vdupq_n_f32(0.f); - float32x4_t vlimit = vdupq_n_f32(limit); -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i <= size - 4; i += 4) { - float32x4_t v = vld1q_f32(input + i); - v = vmaxq_f32(v, vzero); - v = vminq_f32(v, vlimit); - vst1q_f32(output + i, v); - } - // remain - for (index_t i = (size >> 2) << 2; i < size; ++i) { - output[i] = std::min(std::max(input[i], 0.f), limit); - } -#else -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = std::min(std::max(input[i], 0.f), limit); - } -#endif -} - -void LeakyReluNeon(const float *input, const float alpha, - const index_t size, float *output) { -#if defined(MACE_ENABLE_NEON) - float32x4_t vzero = vdupq_n_f32(0.f); - float32x4_t valpha = vdupq_n_f32(alpha); -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i <= size - 4; i += 4) { - float32x4_t v = vld1q_f32(input + i); - float32x4_t u = vminq_f32(v, vzero);; - v = vmaxq_f32(v, vzero); - v = vmlaq_f32(v, valpha, u); - - vst1q_f32(output + i, v); - } - // remain - for (index_t i = (size >> 2) << 2; i < size; ++i) { - output[i] = std::max(input[i], 0.f) + std::min(input[i], 0.f) * alpha; - } -#else -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = std::max(input[i], 0.f) + std::min(input[i], 0.f) * alpha; - } -#endif -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/arm/deconv_2d_neon.h b/mace/ops/arm/deconv_2d_neon.h deleted file mode 100644 index f45fa923bdd19c6420a4ab0e6b751541ce3b1f76..0000000000000000000000000000000000000000 --- a/mace/ops/arm/deconv_2d_neon.h +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_OPS_ARM_DECONV_2D_NEON_H_ -#define MACE_OPS_ARM_DECONV_2D_NEON_H_ - -#include "mace/core/types.h" -#include "mace/ops/arm/common_neon.h" - -namespace mace { -namespace ops { - -void Deconv2dNeonK2x2S1(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output); - -void Deconv2dNeonK2x2S2(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output); - -void Deconv2dNeonK3x3S1(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output); - -void Deconv2dNeonK3x3S2(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output); - -void Deconv2dNeonK4x4S1(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output); - -void Deconv2dNeonK4x4S2(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output); - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_ARM_DECONV_2D_NEON_H_ diff --git a/mace/ops/arm/deconv_2d_neon_2x2.cc b/mace/ops/arm/deconv_2d_neon_2x2.cc deleted file mode 100644 index 674864c8b6527631d4d5800a9e892bc662826bc7..0000000000000000000000000000000000000000 --- a/mace/ops/arm/deconv_2d_neon_2x2.cc +++ /dev/null @@ -1,262 +0,0 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
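The deleted header above declares six fixed-size deconvolution kernels (2x2/3x3/4x4, stride 1 and 2). As the scalar tail loops in the removed implementations below show, they all compute transposed convolution as a scatter-accumulate: each input pixel is multiplied by the whole KxK filter and added into a KxK window of the output. A single-channel scalar sketch of that pattern (names and signature are illustrative, not the MACE API; batching, channels and output cropping are handled by the caller in the real code):

```cpp
// Scalar reference for the pattern the deleted NEON kernels vectorize.
// The output buffer is assumed to be pre-zeroed, with
// out_h = (in_h - 1) * stride + k and out_w = (in_w - 1) * stride + k.
void Deconv2dScalar(const float *input, int in_h, int in_w,
                    const float *filter, int k,    // k x k filter
                    int stride,
                    float *output, int out_w) {
  for (int i = 0; i < in_h; ++i) {
    for (int j = 0; j < in_w; ++j) {
      const float val = input[i * in_w + j];
      for (int di = 0; di < k; ++di) {
        for (int dj = 0; dj < k; ++dj) {
          output[(i * stride + di) * out_w + (j * stride + dj)] +=
              val * filter[di * k + dj];
        }
      }
    }
  }
}
```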
- -#include "mace/utils/macros.h" -#include "mace/ops/arm/deconv_2d_neon.h" - -namespace mace { -namespace ops { - -void Deconv2dNeonK2x2S1(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output) { - const index_t inch = in_shape[1]; - const index_t h = in_shape[2]; - const index_t w = in_shape[3]; - - const index_t outch = out_shape[1]; - const index_t outh = out_shape[2]; - const index_t outw = out_shape[3]; - - const index_t out_img_size = outh * outw; - -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t oc = 0; oc < outch; oc += 2) { - if (oc + 1 < outch) { - float *out_base0 = output + (b * outch + oc) * out_img_size; - float *out_base1 = out_base0 + out_img_size; - for (index_t ic = 0; ic < inch; ++ic) { - const float *input_base = input + (b * inch + ic) * h * w; - const float *kernel_base0 = filter + (oc * inch + ic) * 4; - const float *kernel_base1 = kernel_base0 + inch * 4; - const float *in = input_base; - // output channel 0 - const float *k0 = kernel_base0; - // output channel 1 - const float *k1 = kernel_base1; -#if defined(MACE_ENABLE_NEON) - // load filter - float32x4_t k0_vec = vld1q_f32(k0); - float32x4_t k1_vec = vld1q_f32(k1); -#endif - for (index_t i = 0; i < h; ++i) { - float *out_row_base0 = out_base0 + i * outw; - float *out_row0_0 = out_row_base0; - float *out_row0_1 = out_row_base0 + outw; - - float *out_row_base1 = out_base1 + i * outw; - float *out_row1_0 = out_row_base1; - float *out_row1_1 = out_row_base1 + outw; - - index_t j = 0; -#if defined(MACE_ENABLE_NEON) - for (; j + 3 < w; j += 4) { - float32x4_t in_vec = vld1q_f32(in); - - float32x4_t out00, out01, out02, out03; - float32x4_t out10, out11, out12, out13; - - out00 = vld1q_f32(out_row0_0); - out00 = neon_vfma_lane_0(out00, in_vec, k0_vec); - vst1q_f32(out_row0_0, out00); - - out01 = vld1q_f32(out_row0_0 + 1); - out01 = neon_vfma_lane_1(out01, in_vec, k0_vec); - vst1q_f32(out_row0_0 + 1, out01); - - out02 = vld1q_f32(out_row0_1); - out02 = neon_vfma_lane_2(out02, in_vec, k0_vec); - vst1q_f32(out_row0_1, out02); - - out03 = vld1q_f32(out_row0_1 + 1); - out03 = neon_vfma_lane_3(out03, in_vec, k0_vec); - vst1q_f32(out_row0_1 + 1, out03); - - out10 = vld1q_f32(out_row1_0); - out10 = neon_vfma_lane_0(out10, in_vec, k1_vec); - vst1q_f32(out_row1_0, out10); - - out11 = vld1q_f32(out_row1_0 + 1); - out11 = neon_vfma_lane_1(out11, in_vec, k1_vec); - vst1q_f32(out_row1_0 + 1, out11); - - out12 = vld1q_f32(out_row1_1); - out12 = neon_vfma_lane_2(out12, in_vec, k1_vec); - vst1q_f32(out_row1_1, out12); - - out13 = vld1q_f32(out_row1_1 + 1); - out13 = neon_vfma_lane_3(out13, in_vec, k1_vec); - vst1q_f32(out_row1_1 + 1, out13); - - in += 4; - out_row0_0 += 4; - out_row0_1 += 4; - out_row1_0 += 4; - out_row1_1 += 4; - } -#endif - for (; j < w; ++j) { - float val = in[0]; - for (int k = 0; k < 2; ++k) { - out_row0_0[k] += val * k0[k]; - out_row0_1[k] += val * k0[k + 2]; - out_row1_0[k] += val * k1[k]; - out_row1_1[k] += val * k1[k + 2]; - } - in++; - out_row0_0++; - out_row0_1++; - out_row1_0++; - out_row1_1++; - } - } - } - } else { - float *out_base0 = output + (b * outch + oc) * outh * outw; - for (index_t ic = 0; ic < inch; ++ic) { - const float *input_base = input + (b * inch + ic) * h * w; - const float *kernel_base0 = filter + (oc * inch + ic) * 4; - const float *in = input_base; - const float *k0 = kernel_base0; - -#if defined(MACE_ENABLE_NEON) - // load filter - float32x4_t k0_vec = 
vld1q_f32(k0); -#endif - for (index_t i = 0; i < h; ++i) { - float *out_row_base0 = out_base0 + i * outw; - float *out_row0_0 = out_row_base0; - float *out_row0_1 = out_row_base0 + outw; - index_t j = 0; -#if defined(MACE_ENABLE_NEON) - for (; j + 3 < w; j += 4) { - float32x4_t in_vec = vld1q_f32(in); - float32x4_t out00, out01, out02, out03; - - out00 = vld1q_f32(out_row0_0); - out00 = neon_vfma_lane_0(out00, in_vec, k0_vec); - vst1q_f32(out_row0_0, out00); - - out01 = vld1q_f32(out_row0_0 + 1); - out01 = neon_vfma_lane_1(out01, in_vec, k0_vec); - vst1q_f32(out_row0_0 + 1, out01); - - out02 = vld1q_f32(out_row0_1); - out02 = neon_vfma_lane_2(out02, in_vec, k0_vec); - vst1q_f32(out_row0_1, out02); - - out03 = vld1q_f32(out_row0_1 + 1); - out03 = neon_vfma_lane_3(out03, in_vec, k0_vec); - vst1q_f32(out_row0_1 + 1, out03); - - in += 4; - out_row0_0 += 4; - out_row0_1 += 4; - } -#endif - for (; j < w; ++j) { - float val = in[0]; - for (int k = 0; k < 2; ++k) { - out_row0_0[k] += val * k0[k]; - out_row0_1[k] += val * k0[k + 2]; - } - in++; - out_row0_0++; - out_row0_1++; - } - } - } - } - } - } -} - -void Deconv2dNeonK2x2S2(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output) { - const index_t inch = in_shape[1]; - const index_t h = in_shape[2]; - const index_t w = in_shape[3]; - - const index_t outch = out_shape[1]; - const index_t outh = out_shape[2]; - const index_t outw = out_shape[3]; - const index_t out_img_size = outh * outw; - -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t oc = 0; oc < outch; ++oc) { - float *out_base = output + (b * outch + oc) * out_img_size; - for (index_t ic = 0; ic < inch; ++ic) { - const float *input_base = input + (b * inch + ic) * h * w; - const float *kernel_base = filter + (oc * inch + ic) * 4; - const float *in = input_base; - const float *k0 = kernel_base; -#if defined(MACE_ENABLE_NEON) - float32x4_t k0_vec = vld1q_f32(k0); -#endif - for (index_t i = 0; i < h; ++i) { - float *out_row_base = out_base + i * 2 * outw; - float *out_row_0 = out_row_base; - float *out_row_1 = out_row_0 + outw; - - index_t j = 0; -#if defined(MACE_ENABLE_NEON) - for (; j + 3 < w; j += 4) { - float32x4_t in_vec = vld1q_f32(in); - - // out row 0 - float32x4x2_t out00 = vld2q_f32(out_row_0); - out00.val[0] = - neon_vfma_lane_0(out00.val[0], in_vec, k0_vec); - out00.val[1] = - neon_vfma_lane_1(out00.val[1], in_vec, k0_vec); - vst2q_f32(out_row_0, out00); - - // out row 1 - float32x4x2_t out10 = vld2q_f32(out_row_1); - out10.val[0] = - neon_vfma_lane_2(out10.val[0], in_vec, k0_vec); - out10.val[1] = - neon_vfma_lane_3(out10.val[1], in_vec, k0_vec); - vst2q_f32(out_row_1, out10); - - in += 4; - out_row_0 += 8; - out_row_1 += 8; - } -#endif - for (; j < w; ++j) { - float val = in[0]; - for (int k = 0; k < 2; ++k) { - out_row_0[k] += val * k0[k]; - out_row_1[k] += val * k0[k + 2]; - } - in++; - out_row_0 += 2; - out_row_1 += 2; - } - } - } - } - } -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/arm/deconv_2d_neon_3x3.cc b/mace/ops/arm/deconv_2d_neon_3x3.cc deleted file mode 100644 index 04f62325817f5a02919ea859c3e5c5ba4a974f40..0000000000000000000000000000000000000000 --- a/mace/ops/arm/deconv_2d_neon_3x3.cc +++ /dev/null @@ -1,392 +0,0 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
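The 3x3 and 4x4 kernels that follow lean heavily on the neon_vfma_lane_0..3 helpers from mace/ops/arm/common_neon.h, which this patch does not touch. Judging from their use (acc = neon_vfma_lane_N(acc, in_vec, k_vec)), they are fused multiply-accumulate wrappers that broadcast one lane of the filter vector; a presumed sketch, not the actual MACE source:

```cpp
#include <arm_neon.h>

// Presumed shape of the helpers: acc += v * k[lane].
static inline float32x4_t neon_vfma_lane_0(float32x4_t acc, float32x4_t v,
                                            float32x4_t k) {
#if defined(__aarch64__)
  return vfmaq_laneq_f32(acc, v, k, 0);
#else
  return vmlaq_lane_f32(acc, v, vget_low_f32(k), 0);
#endif
}

static inline float32x4_t neon_vfma_lane_2(float32x4_t acc, float32x4_t v,
                                            float32x4_t k) {
#if defined(__aarch64__)
  return vfmaq_laneq_f32(acc, v, k, 2);
#else
  return vmlaq_lane_f32(acc, v, vget_high_f32(k), 0);
#endif
}
// Lanes 1 and 3 follow the same pattern.
```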
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/utils/macros.h" -#include "mace/ops/arm/deconv_2d_neon.h" - -namespace mace { -namespace ops { - -void Deconv2dNeonK3x3S1(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output) { - const index_t inch = in_shape[1]; - const index_t h = in_shape[2]; - const index_t w = in_shape[3]; - - const index_t outch = out_shape[1]; - const index_t outh = out_shape[2]; - const index_t outw = out_shape[3]; - - const index_t out_img_size = outh * outw; - -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t oc = 0; oc < outch; oc += 2) { - if (oc + 1 < outch) { - float *out_base0 = output + (b * outch + oc) * out_img_size; - float *out_base1 = out_base0 + out_img_size; - for (index_t ic = 0; ic < inch; ++ic) { - const float *input_base = input + (b * inch + ic) * h * w; - const float *kernel_base0 = filter + (oc * inch + ic) * 9; - const float *kernel_base1 = kernel_base0 + inch * 9; - const float *in = input_base; - - // output channel 0 - const float *k0_0 = kernel_base0; - const float *k0_1 = kernel_base0 + 3; - const float *k0_2 = kernel_base0 + 5; - // output channel 1 - const float *k1_0 = kernel_base1; - const float *k1_1 = kernel_base1 + 3; - const float *k1_2 = kernel_base1 + 5; - -#if defined(MACE_ENABLE_NEON) - // load filter - float32x4_t k00_vec, k01_vec, k02_vec; - float32x4_t k10_vec, k11_vec, k12_vec; - - k00_vec = vld1q_f32(k0_0); - k01_vec = vld1q_f32(k0_1); - k02_vec = vld1q_f32(k0_2); - - k10_vec = vld1q_f32(k1_0); - k11_vec = vld1q_f32(k1_1); - k12_vec = vld1q_f32(k1_2); -#endif - for (index_t i = 0; i < h; ++i) { - float *out_row_base0 = out_base0 + i * outw; - float *out_row0_0 = out_row_base0; - float *out_row0_1 = out_row_base0 + outw; - float *out_row0_2 = out_row_base0 + 2 * outw; - - float *out_row_base1 = out_base1 + i * outw; - float *out_row1_0 = out_row_base1; - float *out_row1_1 = out_row_base1 + outw; - float *out_row1_2 = out_row_base1 + 2 * outw; - - index_t j = 0; -#if defined(MACE_ENABLE_NEON) - for (; j + 3 < w; j += 4) { - float32x4_t in_vec = vld1q_f32(in); - - float32x4_t out00, out01, out02; - float32x4_t out10, out11, out12; - float32x4_t out20, out21, out22; - - out00 = vld1q_f32(out_row0_0); - out00 = neon_vfma_lane_0(out00, in_vec, k00_vec); - vst1q_f32(out_row0_0, out00); - - out01 = vld1q_f32(out_row0_0 + 1); - out01 = neon_vfma_lane_1(out01, in_vec, k00_vec); - vst1q_f32(out_row0_0 + 1, out01); - - out02 = vld1q_f32(out_row0_0 + 2); - out02 = neon_vfma_lane_2(out02, in_vec, k00_vec); - vst1q_f32(out_row0_0 + 2, out02); - - out10 = vld1q_f32(out_row0_1 + 0); - out10 = neon_vfma_lane_0(out10, in_vec, k01_vec); - vst1q_f32(out_row0_1 + 0, out10); - - out11 = vld1q_f32(out_row0_1 + 1); - out11 = neon_vfma_lane_1(out11, in_vec, k01_vec); - vst1q_f32(out_row0_1 + 1, out11); - - out12 = vld1q_f32(out_row0_1 + 2); - out12 = neon_vfma_lane_2(out12, in_vec, k01_vec); - vst1q_f32(out_row0_1 + 2, out12); - - out20 = vld1q_f32(out_row0_2 + 0); - out20 = neon_vfma_lane_1(out20, in_vec, k02_vec); - 
vst1q_f32(out_row0_2 + 0, out20); - - out21 = vld1q_f32(out_row0_2 + 1); - out21 = neon_vfma_lane_2(out21, in_vec, k02_vec); - vst1q_f32(out_row0_2 + 1, out21); - - out22 = vld1q_f32(out_row0_2 + 2); - out22 = neon_vfma_lane_3(out22, in_vec, k02_vec); - vst1q_f32(out_row0_2 + 2, out22); - - out00 = vld1q_f32(out_row1_0 + 0); - out00 = neon_vfma_lane_0(out00, in_vec, k10_vec); - vst1q_f32(out_row1_0 + 0, out00); - - out01 = vld1q_f32(out_row1_0 + 1); - out01 = neon_vfma_lane_1(out01, in_vec, k10_vec); - vst1q_f32(out_row1_0 + 1, out01); - - out02 = vld1q_f32(out_row1_0 + 2); - out02 = neon_vfma_lane_2(out02, in_vec, k10_vec); - vst1q_f32(out_row1_0 + 2, out02); - - out10 = vld1q_f32(out_row1_1 + 0); - out10 = neon_vfma_lane_0(out10, in_vec, k11_vec); - vst1q_f32(out_row1_1 + 0, out10); - - out11 = vld1q_f32(out_row1_1 + 1); - out11 = neon_vfma_lane_1(out11, in_vec, k11_vec); - vst1q_f32(out_row1_1 + 1, out11); - - out12 = vld1q_f32(out_row1_1 + 2); - out12 = neon_vfma_lane_2(out12, in_vec, k11_vec); - vst1q_f32(out_row1_1 + 2, out12); - - out20 = vld1q_f32(out_row1_2 + 0); - out20 = neon_vfma_lane_1(out20, in_vec, k12_vec); - vst1q_f32(out_row1_2 + 0, out20); - - out21 = vld1q_f32(out_row1_2 + 1); - out21 = neon_vfma_lane_2(out21, in_vec, k12_vec); - vst1q_f32(out_row1_2 + 1, out21); - - out22 = vld1q_f32(out_row1_2 + 2); - out22 = neon_vfma_lane_3(out22, in_vec, k12_vec); - vst1q_f32(out_row1_2 + 2, out22); - - in += 4; - out_row0_0 += 4; - out_row0_1 += 4; - out_row0_2 += 4; - out_row1_0 += 4; - out_row1_1 += 4; - out_row1_2 += 4; - } -#endif - for (; j < w; ++j) { - float val = in[0]; - for (int k = 0; k < 3; ++k) { - out_row0_0[k] += val * k0_0[k]; - out_row0_1[k] += val * k0_1[k]; - out_row0_2[k] += val * k0_2[k + 1]; - out_row1_0[k] += val * k1_0[k]; - out_row1_1[k] += val * k1_1[k]; - out_row1_2[k] += val * k1_2[k + 1]; - } - in++; - out_row0_0++; - out_row0_1++; - out_row0_2++; - out_row1_0++; - out_row1_1++; - out_row1_2++; - } - } - } - } else { - float *out_base0 = output + (b * outch + oc) * outh * outw; - for (index_t ic = 0; ic < inch; ++ic) { - const float *input_base = input + (b * inch + ic) * h * w; - const float *kernel_base0 = filter + (oc * inch + ic) * 9; - const float *in = input_base; - const float *k0_0 = kernel_base0; - const float *k0_1 = kernel_base0 + 3; - const float *k0_2 = kernel_base0 + 5; - -#if defined(MACE_ENABLE_NEON) - // load filter - float32x4_t k00_vec = vld1q_f32(k0_0); - float32x4_t k01_vec = vld1q_f32(k0_1); - float32x4_t k02_vec = vld1q_f32(k0_2); -#endif - for (index_t i = 0; i < h; ++i) { - float *out_row_base0 = out_base0 + i * outw; - float *out_row0_0 = out_row_base0; - float *out_row0_1 = out_row_base0 + outw; - float *out_row0_2 = out_row_base0 + 2 * outw; - index_t j = 0; -#if defined(MACE_ENABLE_NEON) - for (; j + 3 < w; j += 4) { - float32x4_t in_vec = vld1q_f32(in); - - float32x4_t out00, out01, out02; - float32x4_t out10, out11, out12; - float32x4_t out20, out21, out22; - - out00 = vld1q_f32(out_row0_0 + 0); - out00 = neon_vfma_lane_0(out00, in_vec, k00_vec); - vst1q_f32(out_row0_0 + 0, out00); - - out01 = vld1q_f32(out_row0_0 + 1); - out01 = neon_vfma_lane_1(out01, in_vec, k00_vec); - vst1q_f32(out_row0_0 + 1, out01); - - out02 = vld1q_f32(out_row0_0 + 2); - out02 = neon_vfma_lane_2(out02, in_vec, k00_vec); - vst1q_f32(out_row0_0 + 2, out02); - - out10 = vld1q_f32(out_row0_1 + 0); - out10 = neon_vfma_lane_0(out10, in_vec, k01_vec); - vst1q_f32(out_row0_1 + 0, out10); - - out11 = vld1q_f32(out_row0_1 + 1); - out11 = 
neon_vfma_lane_1(out11, in_vec, k01_vec); - vst1q_f32(out_row0_1 + 1, out11); - - out12 = vld1q_f32(out_row0_1 + 2); - out12 = neon_vfma_lane_2(out12, in_vec, k01_vec); - vst1q_f32(out_row0_1 + 2, out12); - - out20 = vld1q_f32(out_row0_2 + 0); - out20 = neon_vfma_lane_1(out20, in_vec, k02_vec); - vst1q_f32(out_row0_2 + 0, out20); - - out21 = vld1q_f32(out_row0_2 + 1); - out21 = neon_vfma_lane_2(out21, in_vec, k02_vec); - vst1q_f32(out_row0_2 + 1, out21); - - out22 = vld1q_f32(out_row0_2 + 2); - out22 = neon_vfma_lane_3(out22, in_vec, k02_vec); - vst1q_f32(out_row0_2 + 2, out22); - - in += 4; - out_row0_0 += 4; - out_row0_1 += 4; - out_row0_2 += 4; - } -#endif - for (; j < w; ++j) { - float val = in[0]; - for (int k = 0; k < 3; ++k) { - out_row0_0[k] += val * k0_0[k]; - out_row0_1[k] += val * k0_1[k]; - out_row0_2[k] += val * k0_2[k + 1]; - } - in++; - out_row0_0++; - out_row0_1++; - out_row0_2++; - } - } - } - } - } - } -} - -void Deconv2dNeonK3x3S2(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output) { - const index_t inch = in_shape[1]; - const index_t h = in_shape[2]; - const index_t w = in_shape[3]; - - const index_t outch = out_shape[1]; - const index_t outh = out_shape[2]; - const index_t outw = out_shape[3]; - const index_t out_img_size = outh * outw; - -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t oc = 0; oc < outch; ++oc) { - float *out_base = output + (b * outch + oc) * out_img_size; - for (index_t ic = 0; ic < inch; ++ic) { - const float *input_base = input + (b * inch + ic) * h * w; - const float *kernel_base = filter + (oc * inch + ic) * 9; - const float *in = input_base; - - const float *k0 = kernel_base; - const float *k1 = kernel_base + 3; - const float *k2 = kernel_base + 5; - -#if defined(MACE_ENABLE_NEON) - float32x4_t k0_vec = vld1q_f32(k0); - float32x4_t k1_vec = vld1q_f32(k1); - float32x4_t k2_vec = vld1q_f32(k2); -#endif - for (index_t i = 0; i < h; ++i) { - float *out_row_base = out_base + i * 2 * outw; - float *out_row_0 = out_row_base; - float *out_row_1 = out_row_0 + outw; - float *out_row_2 = out_row_1 + outw; - - index_t j = 0; -#if defined(MACE_ENABLE_NEON) - for (index_t n = 0; n + 9 < outw; n += 8) { - float32x4_t in_vec = vld1q_f32(in); - - // out row 0 - float32x4x2_t out00 = vld2q_f32(out_row_0); - out00.val[0] = - neon_vfma_lane_0(out00.val[0], in_vec, k0_vec); - out00.val[1] = - neon_vfma_lane_1(out00.val[1], in_vec, k0_vec); - vst2q_f32(out_row_0, out00); - - float32x4x2_t out01 = vld2q_f32(out_row_0 + 2); - out01.val[0] = - neon_vfma_lane_2(out01.val[0], in_vec, k0_vec); - vst2q_f32(out_row_0 + 2, out01); - - // out row 1 - float32x4x2_t out10 = vld2q_f32(out_row_1); - out10.val[0] = - neon_vfma_lane_0(out10.val[0], in_vec, k1_vec); - out10.val[1] = - neon_vfma_lane_1(out10.val[1], in_vec, k1_vec); - vst2q_f32(out_row_1, out10); - - float32x4x2_t out11 = vld2q_f32(out_row_1 + 2); - out11.val[0] = - neon_vfma_lane_2(out11.val[0], in_vec, k1_vec); - vst2q_f32(out_row_1 + 2, out11); - - // out row 2 - float32x4x2_t out20 = vld2q_f32(out_row_2); - out20.val[0] = - neon_vfma_lane_1(out20.val[0], in_vec, k2_vec); - out20.val[1] = - neon_vfma_lane_2(out20.val[1], in_vec, k2_vec); - vst2q_f32(out_row_2, out20); - - float32x4x2_t out21 = vld2q_f32(out_row_2 + 2); - out21.val[0] = - neon_vfma_lane_3(out21.val[0], in_vec, k2_vec); - vst2q_f32(out_row_2 + 2, out21); - - in += 4; - out_row_0 += 8; - out_row_1 += 8; - out_row_2 += 8; 
- j += 4; - } -#endif - for (; j < w; ++j) { - float val = in[0]; - - for (int k = 0; k < 3; ++k) { - out_row_0[k] += val * k0[k]; - out_row_1[k] += val * k1[k]; - out_row_2[k] += val * k2[k + 1]; - } - - in++; - out_row_0 += 2; - out_row_1 += 2; - out_row_2 += 2; - } - } - } - } - } -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/arm/deconv_2d_neon_4x4.cc b/mace/ops/arm/deconv_2d_neon_4x4.cc deleted file mode 100644 index 443a188f322c448c6e8bf36b14b3babc91725cf4..0000000000000000000000000000000000000000 --- a/mace/ops/arm/deconv_2d_neon_4x4.cc +++ /dev/null @@ -1,506 +0,0 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/utils/macros.h" -#include "mace/ops/arm/deconv_2d_neon.h" - -namespace mace { -namespace ops { - -void Deconv2dNeonK4x4S1(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output) { - const index_t w = in_shape[3]; - const index_t h = in_shape[2]; - const index_t inch = in_shape[1]; - - const index_t outh = out_shape[2]; - const index_t outw = out_shape[3]; - const index_t outch = out_shape[1]; - const index_t out_img_size = outh * outw; -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t oc = 0; oc < outch; oc += 2) { - if (oc + 1 < outch) { - float *out_base = output + (b * outch + oc) * out_img_size; - float *out_base1 = out_base + out_img_size; - for (index_t q = 0; q < inch; q++) { - const float *input_base = input + (b * inch + q) * h * w; - const float *in = input_base; - const float *kernel_base = filter + (oc * inch + q) * 16; - const float *k0 = kernel_base; - const float *k1 = kernel_base + 4; - const float *k2 = kernel_base + 8; - const float *k3 = kernel_base + 12; - - const float *kernel_base1 = kernel_base + inch * 16; - const float *k10 = kernel_base1; - const float *k11 = kernel_base1 + 4; - const float *k12 = kernel_base1 + 8; - const float *k13 = kernel_base1 + 12; -#if defined(MACE_ENABLE_NEON) - float32x4_t k0_vec = vld1q_f32(k0); - float32x4_t k1_vec = vld1q_f32(k1); - float32x4_t k2_vec = vld1q_f32(k2); - float32x4_t k3_vec = vld1q_f32(k3); - - float32x4_t k10_vec = vld1q_f32(k10); - float32x4_t k11_vec = vld1q_f32(k11); - float32x4_t k12_vec = vld1q_f32(k12); - float32x4_t k13_vec = vld1q_f32(k13); -#endif - for (index_t i = 0; i < h; i++) { - float *out_row = out_base + i * outw; - - float *out_row_0 = out_row; - float *out_row_1 = out_row_0 + outw; - float *out_row_2 = out_row_1 + outw; - float *out_row_3 = out_row_2 + outw; - - float *out_row1 = out_base1 + i * outw; - - float *out_row1_0 = out_row1; - float *out_row1_1 = out_row1_0 + outw; - float *out_row1_2 = out_row1_1 + outw; - float *out_row1_3 = out_row1_2 + outw; - - index_t j = 0; -#if defined(MACE_ENABLE_NEON) - for (; j + 3 < w; j += 4) { - float32x4_t in_vec = vld1q_f32(in); - float32x4_t out00, out01, out02, out03; - float32x4_t out10, out11, out12, out13; - 
- out00 = vld1q_f32(out_row_0); - out00 = neon_vfma_lane_0(out00, in_vec, k0_vec); - vst1q_f32(out_row_0, out00); - - out10 = vld1q_f32(out_row1_0); - out10 = neon_vfma_lane_0(out10, in_vec, k10_vec); - vst1q_f32(out_row1_0, out10); - - out01 = vld1q_f32(out_row_0 + 1); - out01 = neon_vfma_lane_1(out01, in_vec, k0_vec); - vst1q_f32(out_row_0 + 1, out01); - - out11 = vld1q_f32(out_row1_0 + 1); - out11 = neon_vfma_lane_1(out11, in_vec, k10_vec); - vst1q_f32(out_row1_0 + 1, out11); - - out02 = vld1q_f32(out_row_0 + 2); - out02 = neon_vfma_lane_2(out02, in_vec, k0_vec); - vst1q_f32(out_row_0 + 2, out02); - - out12 = vld1q_f32(out_row1_0 + 2); - out12 = neon_vfma_lane_2(out12, in_vec, k10_vec); - vst1q_f32(out_row1_0 + 2, out12); - - out03 = vld1q_f32(out_row_0 + 3); - out03 = neon_vfma_lane_3(out03, in_vec, k0_vec); - vst1q_f32(out_row_0 + 3, out03); - - out13 = vld1q_f32(out_row1_0 + 3); - out13 = neon_vfma_lane_3(out13, in_vec, k10_vec); - vst1q_f32(out_row1_0 + 3, out13); - - // - out00 = vld1q_f32(out_row_1); - out00 = neon_vfma_lane_0(out00, in_vec, k1_vec); - vst1q_f32(out_row_1, out00); - - out10 = vld1q_f32(out_row1_1); - out10 = neon_vfma_lane_0(out10, in_vec, k11_vec); - vst1q_f32(out_row1_1, out10); - - out01 = vld1q_f32(out_row_1 + 1); - out01 = neon_vfma_lane_1(out01, in_vec, k1_vec); - vst1q_f32(out_row_1 + 1, out01); - - out11 = vld1q_f32(out_row1_1 + 1); - out11 = neon_vfma_lane_1(out11, in_vec, k11_vec); - vst1q_f32(out_row1_1 + 1, out11); - - out02 = vld1q_f32(out_row_1 + 2); - out02 = neon_vfma_lane_2(out02, in_vec, k1_vec); - vst1q_f32(out_row_1 + 2, out02); - - out12 = vld1q_f32(out_row1_1 + 2); - out12 = neon_vfma_lane_2(out12, in_vec, k11_vec); - vst1q_f32(out_row1_1 + 2, out12); - - out03 = vld1q_f32(out_row_1 + 3); - out03 = neon_vfma_lane_3(out03, in_vec, k1_vec); - vst1q_f32(out_row_1 + 3, out03); - - out13 = vld1q_f32(out_row1_1 + 3); - out13 = neon_vfma_lane_3(out13, in_vec, k11_vec); - vst1q_f32(out_row1_1 + 3, out13); - - // - out00 = vld1q_f32(out_row_2 + 0); - out00 = neon_vfma_lane_0(out00, in_vec, k2_vec); - vst1q_f32(out_row_2 + 0, out00); - - out10 = vld1q_f32(out_row1_2 + 0); - out10 = neon_vfma_lane_0(out10, in_vec, k12_vec); - vst1q_f32(out_row1_2 + 0, out10); - - out01 = vld1q_f32(out_row_2 + 1); - out01 = neon_vfma_lane_1(out01, in_vec, k2_vec); - vst1q_f32(out_row_2 + 1, out01); - - out11 = vld1q_f32(out_row1_2 + 1); - out11 = neon_vfma_lane_1(out11, in_vec, k12_vec); - vst1q_f32(out_row1_2 + 1, out11); - - out02 = vld1q_f32(out_row_2 + 2); - out02 = neon_vfma_lane_2(out02, in_vec, k2_vec); - vst1q_f32(out_row_2 + 2, out02); - - out12 = vld1q_f32(out_row1_2 + 2); - out12 = neon_vfma_lane_2(out12, in_vec, k12_vec); - vst1q_f32(out_row1_2 + 2, out12); - - out03 = vld1q_f32(out_row_2 + 3); - out03 = neon_vfma_lane_3(out03, in_vec, k2_vec); - vst1q_f32(out_row_2 + 3, out03); - - out13 = vld1q_f32(out_row1_2 + 3); - out13 = neon_vfma_lane_3(out13, in_vec, k12_vec); - vst1q_f32(out_row1_2 + 3, out13); - - // - out00 = vld1q_f32(out_row_3 + 0); - out00 = neon_vfma_lane_0(out00, in_vec, k3_vec); - vst1q_f32(out_row_3 + 0, out00); - - out10 = vld1q_f32(out_row1_3 + 0); - out10 = neon_vfma_lane_0(out10, in_vec, k13_vec); - vst1q_f32(out_row1_3 + 0, out10); - - out01 = vld1q_f32(out_row_3 + 1); - out01 = neon_vfma_lane_1(out01, in_vec, k3_vec); - vst1q_f32(out_row_3 + 1, out01); - - out11 = vld1q_f32(out_row1_3 + 1); - out11 = neon_vfma_lane_1(out11, in_vec, k13_vec); - vst1q_f32(out_row1_3 + 1, out11); - - out02 = vld1q_f32(out_row_3 + 2); - out02 = 
neon_vfma_lane_2(out02, in_vec, k3_vec); - vst1q_f32(out_row_3 + 2, out02); - - out12 = vld1q_f32(out_row1_3 + 2); - out12 = neon_vfma_lane_2(out12, in_vec, k13_vec); - vst1q_f32(out_row1_3 + 2, out12); - - out03 = vld1q_f32(out_row_3 + 3); - out03 = neon_vfma_lane_3(out03, in_vec, k3_vec); - vst1q_f32(out_row_3 + 3, out03); - - out13 = vld1q_f32(out_row1_3 + 3); - out13 = neon_vfma_lane_3(out13, in_vec, k13_vec); - vst1q_f32(out_row1_3 + 3, out13); - - in += 4; - out_row_0 += 4; - out_row_1 += 4; - out_row_2 += 4; - out_row_3 += 4; - out_row1_0 += 4; - out_row1_1 += 4; - out_row1_2 += 4; - out_row1_3 += 4; - } -#endif - for (; j < w; j++) { - float val = in[0]; - for (int k = 0; k < 4; ++k) { - out_row_0[k] += val * k0[k]; - out_row_1[k] += val * k1[k]; - out_row_2[k] += val * k2[k]; - out_row_3[k] += val * k3[k]; - out_row1_0[k] += val * k10[k]; - out_row1_1[k] += val * k11[k]; - out_row1_2[k] += val * k12[k]; - out_row1_3[k] += val * k13[k]; - } - in++; - out_row_0++; - out_row_1++; - out_row_2++; - out_row_3++; - out_row1_0++; - out_row1_1++; - out_row1_2++; - out_row1_3++; - } - } - } - } else { - float *out_base = output + (b * outch + oc) * out_img_size; - for (index_t q = 0; q < inch; q++) { - const float *input_base = input + (b * inch + q) * h * w; - const float *kernel_base = filter + (oc * inch + q) * 16; - const float *in = input_base; - const float *k0 = kernel_base; - const float *k1 = kernel_base + 4; - const float *k2 = kernel_base + 8; - const float *k3 = kernel_base + 12; -#if defined(MACE_ENABLE_NEON) - float32x4_t k0_vec = vld1q_f32(k0); - float32x4_t k1_vec = vld1q_f32(k1); - float32x4_t k2_vec = vld1q_f32(k2); - float32x4_t k3_vec = vld1q_f32(k3); -#endif - for (index_t i = 0; i < h; i++) { - float *out_row = out_base + i * outw; - float *out_row_0 = out_row; - float *out_row_1 = out_row_0 + outw; - float *out_row_2 = out_row_1 + outw; - float *out_row_3 = out_row_2 + outw; - int j = 0; -#if defined(MACE_ENABLE_NEON) - for (; j + 3 < w; j += 4) { - float32x4_t in_vec = vld1q_f32(in); - - float32x4_t out00 = vld1q_f32(out_row_0); - out00 = neon_vfma_lane_0(out00, in_vec, k0_vec); - vst1q_f32(out_row_0, out00); - - float32x4_t out01 = vld1q_f32(out_row_0 + 1); - out01 = neon_vfma_lane_1(out01, in_vec, k0_vec); - vst1q_f32(out_row_0 + 1, out01); - - float32x4_t out02 = vld1q_f32(out_row_0 + 2); - out02 = neon_vfma_lane_2(out02, in_vec, k0_vec); - vst1q_f32(out_row_0 + 2, out02); - - float32x4_t out03 = vld1q_f32(out_row_0 + 3); - out03 = neon_vfma_lane_3(out03, in_vec, k0_vec); - vst1q_f32(out_row_0 + 3, out03); - - // - float32x4_t out10 = vld1q_f32(out_row_1); - out10 = neon_vfma_lane_0(out10, in_vec, k1_vec); - vst1q_f32(out_row_1, out10); - - float32x4_t out11 = vld1q_f32(out_row_1 + 1); - out11 = neon_vfma_lane_1(out11, in_vec, k1_vec); - vst1q_f32(out_row_1 + 1, out11); - - float32x4_t out12 = vld1q_f32(out_row_1 + 2); - out12 = neon_vfma_lane_2(out12, in_vec, k1_vec); - vst1q_f32(out_row_1 + 2, out12); - - float32x4_t out13 = vld1q_f32(out_row_1 + 3); - out13 = neon_vfma_lane_3(out13, in_vec, k1_vec); - vst1q_f32(out_row_1 + 3, out13); - - // - float32x4_t out20 = vld1q_f32(out_row_2 + 0); - out20 = neon_vfma_lane_0(out20, in_vec, k2_vec); - vst1q_f32(out_row_2 + 0, out20); - - float32x4_t out21 = vld1q_f32(out_row_2 + 1); - out21 = neon_vfma_lane_1(out21, in_vec, k2_vec); - vst1q_f32(out_row_2 + 1, out21); - - float32x4_t out22 = vld1q_f32(out_row_2 + 2); - out22 = neon_vfma_lane_2(out22, in_vec, k2_vec); - vst1q_f32(out_row_2 + 2, out22); - - float32x4_t out23 
= vld1q_f32(out_row_2 + 3); - out23 = neon_vfma_lane_3(out23, in_vec, k2_vec); - vst1q_f32(out_row_2 + 3, out23); - - // - float32x4_t out30 = vld1q_f32(out_row_3 + 0); - out30 = neon_vfma_lane_0(out30, in_vec, k3_vec); - vst1q_f32(out_row_3 + 0, out30); - - float32x4_t out31 = vld1q_f32(out_row_3 + 1); - out31 = neon_vfma_lane_1(out31, in_vec, k3_vec); - vst1q_f32(out_row_3 + 1, out31); - - float32x4_t out32 = vld1q_f32(out_row_3 + 2); - out32 = neon_vfma_lane_2(out32, in_vec, k3_vec); - vst1q_f32(out_row_3 + 2, out32); - - float32x4_t out33 = vld1q_f32(out_row_3 + 3); - out33 = neon_vfma_lane_3(out33, in_vec, k3_vec); - vst1q_f32(out_row_3 + 3, out33); - - in += 4; - out_row_0 += 4; - out_row_1 += 4; - out_row_2 += 4; - out_row_3 += 4; - } -#endif - for (; j < w; j++) { - float val = in[0]; - for (int k = 0; k < 4; ++k) { - out_row_0[k] += val * k0[k]; - out_row_1[k] += val * k1[k]; - out_row_2[k] += val * k2[k]; - out_row_3[k] += val * k3[k]; - } - in++; - out_row_0++; - out_row_1++; - out_row_2++; - out_row_3++; - } - } - } - } - } - } -} - -void Deconv2dNeonK4x4S2(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output) { - const index_t w = in_shape[3]; - const index_t h = in_shape[2]; - const index_t inch = in_shape[1]; - - const index_t outh = out_shape[2]; - const index_t outw = out_shape[3]; - const index_t outch = out_shape[1]; - const index_t out_img_size = outh * outw; - -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t p = 0; p < outch; p++) { - float *out_base = output + (b * outch + p) * out_img_size; - for (index_t q = 0; q < inch; q++) { - const float *input_base = input + (b * inch + q) * h * w; - const float *kernel_base = filter + (p * inch + q) * 16; - const float *in = input_base; - - const float *k0 = kernel_base; - const float *k1 = kernel_base + 4; - const float *k2 = kernel_base + 8; - const float *k3 = kernel_base + 12; -#if defined(MACE_ENABLE_NEON) - float32x4_t k0_vec = vld1q_f32(k0); - float32x4_t k1_vec = vld1q_f32(k1); - float32x4_t k2_vec = vld1q_f32(k2); - float32x4_t k3_vec = vld1q_f32(k3); -#endif - for (index_t i = 0; i < h; i++) { - float *out_row = out_base + 2 * i * outw; - - float *out_row_0 = out_row; - float *out_row_1 = out_row_0 + outw; - float *out_row_2 = out_row_1 + outw; - float *out_row_3 = out_row_2 + outw; - - index_t j = 0; -#if defined(MACE_ENABLE_NEON) - for (index_t n = 0; n + 9 < outw; n += 8) { - float32x4_t in_vec = vld1q_f32(in); - - // row 0 - float32x4x2_t out0 = vld2q_f32(out_row_0); - out0.val[0] = - neon_vfma_lane_0(out0.val[0], in_vec, k0_vec); - out0.val[1] = - neon_vfma_lane_1(out0.val[1], in_vec, k0_vec); - vst2q_f32(out_row_0, out0); - out0 = vld2q_f32(out_row_0 + 2); - out0.val[0] = - neon_vfma_lane_2(out0.val[0], in_vec, k0_vec); - out0.val[1] = - neon_vfma_lane_3(out0.val[1], in_vec, k0_vec); - vst2q_f32(out_row_0 + 2, out0); - - // row 1 - float32x4x2_t out1 = vld2q_f32(out_row_1); - out1.val[0] = - neon_vfma_lane_0(out1.val[0], in_vec, k1_vec); - out1.val[1] = - neon_vfma_lane_1(out1.val[1], in_vec, k1_vec); - vst2q_f32(out_row_1, out1); - out1 = vld2q_f32(out_row_1 + 2); - out1.val[0] = - neon_vfma_lane_2(out1.val[0], in_vec, k1_vec); - out1.val[1] = - neon_vfma_lane_3(out1.val[1], in_vec, k1_vec); - vst2q_f32(out_row_1 + 2, out1); - - // row 2 - float32x4x2_t out2 = vld2q_f32(out_row_2); - out2.val[0] = - neon_vfma_lane_0(out2.val[0], in_vec, k2_vec); - out2.val[1] = - 
neon_vfma_lane_1(out2.val[1], in_vec, k2_vec); - vst2q_f32(out_row_2, out2); - out2 = vld2q_f32(out_row_2 + 2); - out2.val[0] = - neon_vfma_lane_2(out2.val[0], in_vec, k2_vec); - out2.val[1] = - neon_vfma_lane_3(out2.val[1], in_vec, k2_vec); - vst2q_f32(out_row_2 + 2, out2); - - // row 3 - float32x4x2_t out3 = vld2q_f32(out_row_3); - out3.val[0] = - neon_vfma_lane_0(out3.val[0], in_vec, k3_vec); - out3.val[1] = - neon_vfma_lane_1(out3.val[1], in_vec, k3_vec); - vst2q_f32(out_row_3, out3); - out3 = vld2q_f32(out_row_3 + 2); - out3.val[0] = - neon_vfma_lane_2(out3.val[0], in_vec, k3_vec); - out3.val[1] = - neon_vfma_lane_3(out3.val[1], in_vec, k3_vec); - vst2q_f32(out_row_3 + 2, out3); - - in += 4; - out_row_0 += 8; - out_row_1 += 8; - out_row_2 += 8; - out_row_3 += 8; - j += 4; - } -#endif - for (; j < w; j++) { - float val = in[0]; - for (int k = 0; k < 4; ++k) { - out_row_0[k] += val * k0[k]; - out_row_1[k] += val * k1[k]; - out_row_2[k] += val * k2[k]; - out_row_3[k] += val * k3[k]; - } - in++; - out_row_0 += 2; - out_row_1 += 2; - out_row_2 += 2; - out_row_3 += 2; - } - } - } - } - } -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/arm/depthwise_deconv2d_neon.h b/mace/ops/arm/depthwise_deconv2d_neon.h deleted file mode 100644 index 8df6dba15bd61d22054f0d0ecac2b35bd060ec76..0000000000000000000000000000000000000000 --- a/mace/ops/arm/depthwise_deconv2d_neon.h +++ /dev/null @@ -1,79 +0,0 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_ARM_DEPTHWISE_DECONV2D_NEON_H_ -#define MACE_OPS_ARM_DEPTHWISE_DECONV2D_NEON_H_ - -#include "mace/core/types.h" -#include "mace/ops/arm/common_neon.h" - -namespace mace { -namespace ops { - -void DepthwiseDeconv2dNeonK3x3S1(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output); - -void DepthwiseDeconv2dNeonK3x3S2(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output); - -void DepthwiseDeconv2dNeonK4x4S1(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output); - -void DepthwiseDeconv2dNeonK4x4S2(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output); - -void GroupDeconv2dNeonK3x3S1(const float *input, - const float *filter, - const int group, - const index_t *in_shape, - const index_t *out_shape, - float *output); - -void GroupDeconv2dNeonK3x3S2(const float *input, - const float *filter, - const int group, - const index_t *in_shape, - const index_t *out_shape, - float *output); - -void GroupDeconv2dNeonK4x4S1(const float *input, - const float *filter, - const int group, - const index_t *in_shape, - const index_t *out_shape, - float *output); - -void GroupDeconv2dNeonK4x4S2(const float *input, - const float *filter, - const int group, - const index_t *in_shape, - const index_t *out_shape, - float *output); - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_ARM_DEPTHWISE_DECONV2D_NEON_H_ diff --git a/mace/ops/arm/depthwise_deconv2d_neon_3x3.cc b/mace/ops/arm/depthwise_deconv2d_neon_3x3.cc deleted file mode 100644 index 6bba47c280bfb1fe22055c7440e9180b6afdc98e..0000000000000000000000000000000000000000 --- a/mace/ops/arm/depthwise_deconv2d_neon_3x3.cc +++ /dev/null @@ -1,629 +0,0 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
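The depthwise and group variants deleted below differ from the plain deconv kernels only in how channels are indexed: a depthwise kernel reads exactly one input channel and one per-channel filter (no accumulation across input channels), while the group variant partitions channels into independent groups. A small illustration of the NCHW base offsets visible in the removed code (helper names are mine):

```cpp
#include <cstddef>

// Depthwise: output channel c depends only on input channel c
// and on the c-th KxK filter.
inline std::size_t DepthwiseInputBase(std::size_t b, std::size_t c,
                                      std::size_t channels,
                                      std::size_t in_h, std::size_t in_w) {
  return (b * channels + c) * in_h * in_w;
}

inline std::size_t DepthwiseFilterBase(std::size_t c, std::size_t k) {
  return c * k * k;  // one KxK filter per channel
}
```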
- -#include "mace/utils/macros.h" -#include "mace/ops/arm/depthwise_deconv2d_neon.h" - -namespace mace { -namespace ops { - -void DepthwiseDeconv2dNeonK3x3S1(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output) { - const index_t channels = in_shape[1]; - const index_t h = in_shape[2]; - const index_t w = in_shape[3]; - const index_t in_img_size = h * w; - - const index_t outh = out_shape[2]; - const index_t outw = out_shape[3]; - const index_t out_img_size = outh * outw; - -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t c = 0; c < channels; ++c) { - const index_t offset = b * channels + c; - float *out_base = output + offset * out_img_size; - const float *input_base = input + offset * in_img_size; - const float *kernel_base = filter + c * 9; - const float *in = input_base; - const float *k0 = kernel_base; - const float *k1 = kernel_base + 3; - const float *k2 = kernel_base + 5; - -#if defined(MACE_ENABLE_NEON) - // load filter - float32x4_t k0_vec = vld1q_f32(k0); - float32x4_t k1_vec = vld1q_f32(k1); - float32x4_t k2_vec = vld1q_f32(k2); -#endif - for (index_t i = 0; i < h; ++i) { - float *out_row_base = out_base + i * outw; - float *out_row0 = out_row_base; - float *out_row1 = out_row_base + outw; - float *out_row2 = out_row_base + 2 * outw; - index_t j = 0; -#if defined(MACE_ENABLE_NEON) - for (; j + 3 < w; j += 4) { - float32x4_t in_vec = vld1q_f32(in); - - float32x4_t out00, out01, out02; - float32x4_t out10, out11, out12; - float32x4_t out20, out21, out22; - - out00 = vld1q_f32(out_row0 + 0); - out00 = neon_vfma_lane_0(out00, in_vec, k0_vec); - vst1q_f32(out_row0 + 0, out00); - - out01 = vld1q_f32(out_row0 + 1); - out01 = neon_vfma_lane_1(out01, in_vec, k0_vec); - vst1q_f32(out_row0 + 1, out01); - - out02 = vld1q_f32(out_row0 + 2); - out02 = neon_vfma_lane_2(out02, in_vec, k0_vec); - vst1q_f32(out_row0 + 2, out02); - - out10 = vld1q_f32(out_row1 + 0); - out10 = neon_vfma_lane_0(out10, in_vec, k1_vec); - vst1q_f32(out_row1 + 0, out10); - - out11 = vld1q_f32(out_row1 + 1); - out11 = neon_vfma_lane_1(out11, in_vec, k1_vec); - vst1q_f32(out_row1 + 1, out11); - - out12 = vld1q_f32(out_row1 + 2); - out12 = neon_vfma_lane_2(out12, in_vec, k1_vec); - vst1q_f32(out_row1 + 2, out12); - - out20 = vld1q_f32(out_row2 + 0); - out20 = neon_vfma_lane_1(out20, in_vec, k2_vec); - vst1q_f32(out_row2 + 0, out20); - - out21 = vld1q_f32(out_row2 + 1); - out21 = neon_vfma_lane_2(out21, in_vec, k2_vec); - vst1q_f32(out_row2 + 1, out21); - - out22 = vld1q_f32(out_row2 + 2); - out22 = neon_vfma_lane_3(out22, in_vec, k2_vec); - vst1q_f32(out_row2 + 2, out22); - - in += 4; - out_row0 += 4; - out_row1 += 4; - out_row2 += 4; - } -#endif - for (; j < w; ++j) { - float val = in[0]; - for (int k = 0; k < 3; ++k) { - out_row0[k] += val * k0[k]; - out_row1[k] += val * k1[k]; - out_row2[k] += val * k2[k + 1]; - } - in++; - out_row0++; - out_row1++; - out_row2++; - } - } - } - } -} - -void DepthwiseDeconv2dNeonK3x3S2(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output) { - const index_t channels = in_shape[1]; - const index_t h = in_shape[2]; - const index_t w = in_shape[3]; - const index_t in_img_size = h * w; - - const index_t outh = out_shape[2]; - const index_t outw = out_shape[3]; - const index_t out_img_size = outh * outw; - -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) 
{ - for (index_t c = 0; c < channels; ++c) { - const index_t offset = b * channels + c; - float *out_base = output + offset * out_img_size; - const float *input_base = input + offset * in_img_size; - const float *kernel_base = filter + c * 9; - const float *in = input_base; - - const float *k0 = kernel_base; - const float *k1 = kernel_base + 3; - const float *k2 = kernel_base + 5; - -#if defined(MACE_ENABLE_NEON) - float32x4_t k0_vec = vld1q_f32(k0); - float32x4_t k1_vec = vld1q_f32(k1); - float32x4_t k2_vec = vld1q_f32(k2); -#endif - for (index_t i = 0; i < h; ++i) { - float *out_row_base = out_base + i * 2 * outw; - float *out_row_0 = out_row_base; - float *out_row_1 = out_row_0 + outw; - float *out_row_2 = out_row_1 + outw; - - index_t j = 0; -#if defined(MACE_ENABLE_NEON) - for (index_t n = 0; n + 9 < outw; n += 8) { - float32x4_t in_vec = vld1q_f32(in); - - // out row 0 - float32x4x2_t out00 = vld2q_f32(out_row_0); - out00.val[0] = - neon_vfma_lane_0(out00.val[0], in_vec, k0_vec); - out00.val[1] = - neon_vfma_lane_1(out00.val[1], in_vec, k0_vec); - vst2q_f32(out_row_0, out00); - - float32x4x2_t out01 = vld2q_f32(out_row_0 + 2); - out01.val[0] = - neon_vfma_lane_2(out01.val[0], in_vec, k0_vec); - vst2q_f32(out_row_0 + 2, out01); - - // out row 1 - float32x4x2_t out10 = vld2q_f32(out_row_1); - out10.val[0] = - neon_vfma_lane_0(out10.val[0], in_vec, k1_vec); - out10.val[1] = - neon_vfma_lane_1(out10.val[1], in_vec, k1_vec); - vst2q_f32(out_row_1, out10); - - float32x4x2_t out11 = vld2q_f32(out_row_1 + 2); - out11.val[0] = - neon_vfma_lane_2(out11.val[0], in_vec, k1_vec); - vst2q_f32(out_row_1 + 2, out11); - - // out row 2 - float32x4x2_t out20 = vld2q_f32(out_row_2); - out20.val[0] = - neon_vfma_lane_1(out20.val[0], in_vec, k2_vec); - out20.val[1] = - neon_vfma_lane_2(out20.val[1], in_vec, k2_vec); - vst2q_f32(out_row_2, out20); - - float32x4x2_t out21 = vld2q_f32(out_row_2 + 2); - out21.val[0] = - neon_vfma_lane_3(out21.val[0], in_vec, k2_vec); - vst2q_f32(out_row_2 + 2, out21); - - in += 4; - out_row_0 += 8; - out_row_1 += 8; - out_row_2 += 8; - j += 4; - } -#endif - for (; j < w; ++j) { - float val = in[0]; - - for (int k = 0; k < 3; ++k) { - out_row_0[k] += val * k0[k]; - out_row_1[k] += val * k1[k]; - out_row_2[k] += val * k2[k + 1]; - } - - in++; - out_row_0 += 2; - out_row_1 += 2; - out_row_2 += 2; - } - } - } - } -} - -void GroupDeconv2dNeonK3x3S1(const float *input, - const float *filter, - const int group, - const index_t *in_shape, - const index_t *out_shape, - float *output) { - const index_t inch = in_shape[1]; - const index_t h = in_shape[2]; - const index_t w = in_shape[3]; - - const index_t outch = out_shape[1]; - const index_t outh = out_shape[2]; - const index_t outw = out_shape[3]; - - const index_t in_img_size = h * w; - const index_t out_img_size = outh * outw; - - const index_t inch_g = inch / group; - const index_t outch_g = outch / group; - -#pragma omp parallel for collapse(3) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (int g = 0; g < group; ++g) { - for (index_t oc = 0; oc < outch_g; oc += 2) { - if (oc + 1 < outch_g) { - const index_t out_offset = b * outch + outch_g * g + oc; - float *out_base0 = output + out_offset * out_img_size; - float *out_base1 = out_base0 + out_img_size; - for (index_t ic = 0; ic < inch_g; ++ic) { - const index_t in_offset = b * inch + inch_g * g + ic; - const float *input_base = input + in_offset * in_img_size; - const index_t kernel_offset = (oc * group + g) * inch_g + ic; - const float *kernel_base0 = 
filter + kernel_offset * 9; - const float *kernel_base1 = kernel_base0 + inch * 9; - const float *in = input_base; - - // output channel 0 - const float *k0_0 = kernel_base0; - const float *k0_1 = kernel_base0 + 3; - const float *k0_2 = kernel_base0 + 5; - // output channel 1 - const float *k1_0 = kernel_base1; - const float *k1_1 = kernel_base1 + 3; - const float *k1_2 = kernel_base1 + 5; - -#if defined(MACE_ENABLE_NEON) - // load filter - float32x4_t k00_vec, k01_vec, k02_vec; - float32x4_t k10_vec, k11_vec, k12_vec; - - k00_vec = vld1q_f32(k0_0); - k01_vec = vld1q_f32(k0_1); - k02_vec = vld1q_f32(k0_2); - - k10_vec = vld1q_f32(k1_0); - k11_vec = vld1q_f32(k1_1); - k12_vec = vld1q_f32(k1_2); -#endif - for (index_t i = 0; i < h; ++i) { - float *out_row_base0 = out_base0 + i * outw; - float *out_row0_0 = out_row_base0; - float *out_row0_1 = out_row_base0 + outw; - float *out_row0_2 = out_row_base0 + 2 * outw; - - float *out_row_base1 = out_base1 + i * outw; - float *out_row1_0 = out_row_base1; - float *out_row1_1 = out_row_base1 + outw; - float *out_row1_2 = out_row_base1 + 2 * outw; - - index_t j = 0; -#if defined(MACE_ENABLE_NEON) - for (; j + 3 < w; j += 4) { - float32x4_t in_vec = vld1q_f32(in); - - float32x4_t out00, out01, out02; - float32x4_t out10, out11, out12; - float32x4_t out20, out21, out22; - - out00 = vld1q_f32(out_row0_0); - out00 = neon_vfma_lane_0(out00, in_vec, k00_vec); - vst1q_f32(out_row0_0, out00); - - out01 = vld1q_f32(out_row0_0 + 1); - out01 = neon_vfma_lane_1(out01, in_vec, k00_vec); - vst1q_f32(out_row0_0 + 1, out01); - - out02 = vld1q_f32(out_row0_0 + 2); - out02 = neon_vfma_lane_2(out02, in_vec, k00_vec); - vst1q_f32(out_row0_0 + 2, out02); - - out10 = vld1q_f32(out_row0_1 + 0); - out10 = neon_vfma_lane_0(out10, in_vec, k01_vec); - vst1q_f32(out_row0_1 + 0, out10); - - out11 = vld1q_f32(out_row0_1 + 1); - out11 = neon_vfma_lane_1(out11, in_vec, k01_vec); - vst1q_f32(out_row0_1 + 1, out11); - - out12 = vld1q_f32(out_row0_1 + 2); - out12 = neon_vfma_lane_2(out12, in_vec, k01_vec); - vst1q_f32(out_row0_1 + 2, out12); - - out20 = vld1q_f32(out_row0_2 + 0); - out20 = neon_vfma_lane_1(out20, in_vec, k02_vec); - vst1q_f32(out_row0_2 + 0, out20); - - out21 = vld1q_f32(out_row0_2 + 1); - out21 = neon_vfma_lane_2(out21, in_vec, k02_vec); - vst1q_f32(out_row0_2 + 1, out21); - - out22 = vld1q_f32(out_row0_2 + 2); - out22 = neon_vfma_lane_3(out22, in_vec, k02_vec); - vst1q_f32(out_row0_2 + 2, out22); - - out00 = vld1q_f32(out_row1_0 + 0); - out00 = neon_vfma_lane_0(out00, in_vec, k10_vec); - vst1q_f32(out_row1_0 + 0, out00); - - out01 = vld1q_f32(out_row1_0 + 1); - out01 = neon_vfma_lane_1(out01, in_vec, k10_vec); - vst1q_f32(out_row1_0 + 1, out01); - - out02 = vld1q_f32(out_row1_0 + 2); - out02 = neon_vfma_lane_2(out02, in_vec, k10_vec); - vst1q_f32(out_row1_0 + 2, out02); - - out10 = vld1q_f32(out_row1_1 + 0); - out10 = neon_vfma_lane_0(out10, in_vec, k11_vec); - vst1q_f32(out_row1_1 + 0, out10); - - out11 = vld1q_f32(out_row1_1 + 1); - out11 = neon_vfma_lane_1(out11, in_vec, k11_vec); - vst1q_f32(out_row1_1 + 1, out11); - - out12 = vld1q_f32(out_row1_1 + 2); - out12 = neon_vfma_lane_2(out12, in_vec, k11_vec); - vst1q_f32(out_row1_1 + 2, out12); - - out20 = vld1q_f32(out_row1_2 + 0); - out20 = neon_vfma_lane_1(out20, in_vec, k12_vec); - vst1q_f32(out_row1_2 + 0, out20); - - out21 = vld1q_f32(out_row1_2 + 1); - out21 = neon_vfma_lane_2(out21, in_vec, k12_vec); - vst1q_f32(out_row1_2 + 1, out21); - - out22 = vld1q_f32(out_row1_2 + 2); - out22 = neon_vfma_lane_3(out22, 
in_vec, k12_vec); - vst1q_f32(out_row1_2 + 2, out22); - - in += 4; - out_row0_0 += 4; - out_row0_1 += 4; - out_row0_2 += 4; - out_row1_0 += 4; - out_row1_1 += 4; - out_row1_2 += 4; - } -#endif - for (; j < w; ++j) { - float val = in[0]; - for (int k = 0; k < 3; ++k) { - out_row0_0[k] += val * k0_0[k]; - out_row0_1[k] += val * k0_1[k]; - out_row0_2[k] += val * k0_2[k + 1]; - out_row1_0[k] += val * k1_0[k]; - out_row1_1[k] += val * k1_1[k]; - out_row1_2[k] += val * k1_2[k + 1]; - } - in++; - out_row0_0++; - out_row0_1++; - out_row0_2++; - out_row1_0++; - out_row1_1++; - out_row1_2++; - } - } - } - } else { - const index_t out_offset = b * outch + outch_g * g + oc; - float *out_base0 = output + out_offset * out_img_size; - for (index_t ic = 0; ic < inch_g; ++ic) { - const index_t in_offset = (b * group + g) * inch_g + ic; - const float *input_base = input + in_offset * in_img_size; - const index_t kernel_offset = (oc * group + g) * inch_g + ic; - const float *kernel_base0 = filter + kernel_offset * 9; - const float *in = input_base; - const float *k0_0 = kernel_base0; - const float *k0_1 = kernel_base0 + 3; - const float *k0_2 = kernel_base0 + 5; - -#if defined(MACE_ENABLE_NEON) - // load filter - float32x4_t k00_vec = vld1q_f32(k0_0); - float32x4_t k01_vec = vld1q_f32(k0_1); - float32x4_t k02_vec = vld1q_f32(k0_2); -#endif - for (index_t i = 0; i < h; ++i) { - float *out_row_base0 = out_base0 + i * outw; - float *out_row0_0 = out_row_base0; - float *out_row0_1 = out_row_base0 + outw; - float *out_row0_2 = out_row_base0 + 2 * outw; - index_t j = 0; -#if defined(MACE_ENABLE_NEON) - for (; j + 3 < w; j += 4) { - float32x4_t in_vec = vld1q_f32(in); - - float32x4_t out00, out01, out02; - float32x4_t out10, out11, out12; - float32x4_t out20, out21, out22; - - out00 = vld1q_f32(out_row0_0 + 0); - out00 = neon_vfma_lane_0(out00, in_vec, k00_vec); - vst1q_f32(out_row0_0 + 0, out00); - - out01 = vld1q_f32(out_row0_0 + 1); - out01 = neon_vfma_lane_1(out01, in_vec, k00_vec); - vst1q_f32(out_row0_0 + 1, out01); - - out02 = vld1q_f32(out_row0_0 + 2); - out02 = neon_vfma_lane_2(out02, in_vec, k00_vec); - vst1q_f32(out_row0_0 + 2, out02); - - out10 = vld1q_f32(out_row0_1 + 0); - out10 = neon_vfma_lane_0(out10, in_vec, k01_vec); - vst1q_f32(out_row0_1 + 0, out10); - - out11 = vld1q_f32(out_row0_1 + 1); - out11 = neon_vfma_lane_1(out11, in_vec, k01_vec); - vst1q_f32(out_row0_1 + 1, out11); - - out12 = vld1q_f32(out_row0_1 + 2); - out12 = neon_vfma_lane_2(out12, in_vec, k01_vec); - vst1q_f32(out_row0_1 + 2, out12); - - out20 = vld1q_f32(out_row0_2 + 0); - out20 = neon_vfma_lane_1(out20, in_vec, k02_vec); - vst1q_f32(out_row0_2 + 0, out20); - - out21 = vld1q_f32(out_row0_2 + 1); - out21 = neon_vfma_lane_2(out21, in_vec, k02_vec); - vst1q_f32(out_row0_2 + 1, out21); - - out22 = vld1q_f32(out_row0_2 + 2); - out22 = neon_vfma_lane_3(out22, in_vec, k02_vec); - vst1q_f32(out_row0_2 + 2, out22); - - in += 4; - out_row0_0 += 4; - out_row0_1 += 4; - out_row0_2 += 4; - } -#endif - for (; j < w; ++j) { - float val = in[0]; - for (int k = 0; k < 3; ++k) { - out_row0_0[k] += val * k0_0[k]; - out_row0_1[k] += val * k0_1[k]; - out_row0_2[k] += val * k0_2[k + 1]; - } - in++; - out_row0_0++; - out_row0_1++; - out_row0_2++; - } - } - } - } - } - } - } -} - -void GroupDeconv2dNeonK3x3S2(const float *input, - const float *filter, - const int group, - const index_t *in_shape, - const index_t *out_shape, - float *output) { - const index_t inch = in_shape[1]; - const index_t h = in_shape[2]; - const index_t w = in_shape[3]; - - 
const index_t outch = out_shape[1]; - const index_t outh = out_shape[2]; - const index_t outw = out_shape[3]; - - const index_t in_img_size = h * w; - const index_t out_img_size = outh * outw; - - const index_t inch_g = inch / group; - const index_t outch_g = outch / group; - -#pragma omp parallel for collapse(3) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (int g = 0; g < group; ++g) { - for (index_t oc = 0; oc < outch_g; ++oc) { - const index_t out_offset = b * outch + outch_g * g + oc; - float *out_base = output + out_offset * out_img_size; - for (index_t ic = 0; ic < inch_g; ++ic) { - const index_t in_offset = b * inch + inch_g * g + ic; - const float *input_base = input + in_offset * in_img_size; - const index_t kernel_offset = (oc * group + g) * inch_g + ic; - const float *kernel_base = filter + kernel_offset * 9; - const float *in = input_base; - - const float *k0 = kernel_base; - const float *k1 = kernel_base + 3; - const float *k2 = kernel_base + 5; - -#if defined(MACE_ENABLE_NEON) - float32x4_t k0_vec = vld1q_f32(k0); - float32x4_t k1_vec = vld1q_f32(k1); - float32x4_t k2_vec = vld1q_f32(k2); -#endif - for (index_t i = 0; i < h; ++i) { - float *out_row_base = out_base + i * 2 * outw; - float *out_row_0 = out_row_base; - float *out_row_1 = out_row_0 + outw; - float *out_row_2 = out_row_1 + outw; - - index_t j = 0; -#if defined(MACE_ENABLE_NEON) - for (index_t n = 0; n + 9 < outw; n += 8) { - float32x4_t in_vec = vld1q_f32(in); - - // out row 0 - float32x4x2_t out00 = vld2q_f32(out_row_0); - out00.val[0] = - neon_vfma_lane_0(out00.val[0], in_vec, k0_vec); - out00.val[1] = - neon_vfma_lane_1(out00.val[1], in_vec, k0_vec); - vst2q_f32(out_row_0, out00); - - float32x4x2_t out01 = vld2q_f32(out_row_0 + 2); - out01.val[0] = - neon_vfma_lane_2(out01.val[0], in_vec, k0_vec); - vst2q_f32(out_row_0 + 2, out01); - - // out row 1 - float32x4x2_t out10 = vld2q_f32(out_row_1); - out10.val[0] = - neon_vfma_lane_0(out10.val[0], in_vec, k1_vec); - out10.val[1] = - neon_vfma_lane_1(out10.val[1], in_vec, k1_vec); - vst2q_f32(out_row_1, out10); - - float32x4x2_t out11 = vld2q_f32(out_row_1 + 2); - out11.val[0] = - neon_vfma_lane_2(out11.val[0], in_vec, k1_vec); - vst2q_f32(out_row_1 + 2, out11); - - // out row 2 - float32x4x2_t out20 = vld2q_f32(out_row_2); - out20.val[0] = - neon_vfma_lane_1(out20.val[0], in_vec, k2_vec); - out20.val[1] = - neon_vfma_lane_2(out20.val[1], in_vec, k2_vec); - vst2q_f32(out_row_2, out20); - - float32x4x2_t out21 = vld2q_f32(out_row_2 + 2); - out21.val[0] = - neon_vfma_lane_3(out21.val[0], in_vec, k2_vec); - vst2q_f32(out_row_2 + 2, out21); - - in += 4; - out_row_0 += 8; - out_row_1 += 8; - out_row_2 += 8; - j += 4; - } -#endif - for (; j < w; ++j) { - float val = in[0]; - - for (int k = 0; k < 3; ++k) { - out_row_0[k] += val * k0[k]; - out_row_1[k] += val * k1[k]; - out_row_2[k] += val * k2[k + 1]; - } - - in++; - out_row_0 += 2; - out_row_1 += 2; - out_row_2 += 2; - } - } - } - } - } - } -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/arm/depthwise_deconv2d_neon_4x4.cc b/mace/ops/arm/depthwise_deconv2d_neon_4x4.cc deleted file mode 100644 index 677eb152bb5f7d984a9f7bd003bcbf0e42a1da1f..0000000000000000000000000000000000000000 --- a/mace/ops/arm/depthwise_deconv2d_neon_4x4.cc +++ /dev/null @@ -1,807 +0,0 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/utils/macros.h" -#include "mace/ops/arm/deconv_2d_neon.h" - -namespace mace { -namespace ops { - -void DepthwiseDeconv2dNeonK4x4S1(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output) { - const index_t batch = in_shape[0]; - const index_t channels = in_shape[1]; - const index_t w = in_shape[3]; - const index_t h = in_shape[2]; - const index_t in_img_size = h * w; - - const index_t outh = out_shape[2]; - const index_t outw = out_shape[3]; - const index_t out_img_size = outh * outw; - -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < batch; ++b) { - for (index_t c = 0; c < channels; ++c) { - const index_t offset = b * channels + c; - float *out_base = output + offset * out_img_size; - const float *input_base = input + offset * in_img_size; - const float *kernel_base = filter + c * 16; - const float *in = input_base; - const float *k0 = kernel_base; - const float *k1 = kernel_base + 4; - const float *k2 = kernel_base + 8; - const float *k3 = kernel_base + 12; -#if defined(MACE_ENABLE_NEON) - float32x4_t k0_vec = vld1q_f32(k0); - float32x4_t k1_vec = vld1q_f32(k1); - float32x4_t k2_vec = vld1q_f32(k2); - float32x4_t k3_vec = vld1q_f32(k3); -#endif - for (index_t i = 0; i < h; i++) { - float *out_row = out_base + i * outw; - float *out_row_0 = out_row; - float *out_row_1 = out_row_0 + outw; - float *out_row_2 = out_row_1 + outw; - float *out_row_3 = out_row_2 + outw; - index_t j = 0; -#if defined(MACE_ENABLE_NEON) - for (; j + 3 < w; j += 4) { - float32x4_t in_vec = vld1q_f32(in); - - float32x4_t out00 = vld1q_f32(out_row_0); - out00 = neon_vfma_lane_0(out00, in_vec, k0_vec); - vst1q_f32(out_row_0, out00); - - float32x4_t out01 = vld1q_f32(out_row_0 + 1); - out01 = neon_vfma_lane_1(out01, in_vec, k0_vec); - vst1q_f32(out_row_0 + 1, out01); - - float32x4_t out02 = vld1q_f32(out_row_0 + 2); - out02 = neon_vfma_lane_2(out02, in_vec, k0_vec); - vst1q_f32(out_row_0 + 2, out02); - - float32x4_t out03 = vld1q_f32(out_row_0 + 3); - out03 = neon_vfma_lane_3(out03, in_vec, k0_vec); - vst1q_f32(out_row_0 + 3, out03); - - // - float32x4_t out10 = vld1q_f32(out_row_1); - out10 = neon_vfma_lane_0(out10, in_vec, k1_vec); - vst1q_f32(out_row_1, out10); - - float32x4_t out11 = vld1q_f32(out_row_1 + 1); - out11 = neon_vfma_lane_1(out11, in_vec, k1_vec); - vst1q_f32(out_row_1 + 1, out11); - - float32x4_t out12 = vld1q_f32(out_row_1 + 2); - out12 = neon_vfma_lane_2(out12, in_vec, k1_vec); - vst1q_f32(out_row_1 + 2, out12); - - float32x4_t out13 = vld1q_f32(out_row_1 + 3); - out13 = neon_vfma_lane_3(out13, in_vec, k1_vec); - vst1q_f32(out_row_1 + 3, out13); - - // - float32x4_t out20 = vld1q_f32(out_row_2 + 0); - out20 = neon_vfma_lane_0(out20, in_vec, k2_vec); - vst1q_f32(out_row_2 + 0, out20); - - float32x4_t out21 = vld1q_f32(out_row_2 + 1); - out21 = neon_vfma_lane_1(out21, in_vec, k2_vec); - vst1q_f32(out_row_2 + 1, out21); - - float32x4_t out22 = vld1q_f32(out_row_2 + 2); - out22 = neon_vfma_lane_2(out22, in_vec, k2_vec); - vst1q_f32(out_row_2 + 2, out22); - - float32x4_t out23 = 
vld1q_f32(out_row_2 + 3); - out23 = neon_vfma_lane_3(out23, in_vec, k2_vec); - vst1q_f32(out_row_2 + 3, out23); - - // - float32x4_t out30 = vld1q_f32(out_row_3 + 0); - out30 = neon_vfma_lane_0(out30, in_vec, k3_vec); - vst1q_f32(out_row_3 + 0, out30); - - float32x4_t out31 = vld1q_f32(out_row_3 + 1); - out31 = neon_vfma_lane_1(out31, in_vec, k3_vec); - vst1q_f32(out_row_3 + 1, out31); - - float32x4_t out32 = vld1q_f32(out_row_3 + 2); - out32 = neon_vfma_lane_2(out32, in_vec, k3_vec); - vst1q_f32(out_row_3 + 2, out32); - - float32x4_t out33 = vld1q_f32(out_row_3 + 3); - out33 = neon_vfma_lane_3(out33, in_vec, k3_vec); - vst1q_f32(out_row_3 + 3, out33); - - in += 4; - out_row_0 += 4; - out_row_1 += 4; - out_row_2 += 4; - out_row_3 += 4; - } -#endif - for (; j < w; j++) { - float val = in[0]; - for (int k = 0; k < 4; ++k) { - out_row_0[k] += val * k0[k]; - out_row_1[k] += val * k1[k]; - out_row_2[k] += val * k2[k]; - out_row_3[k] += val * k3[k]; - } - in++; - out_row_0++; - out_row_1++; - out_row_2++; - out_row_3++; - } - } - } - } -} - -void DepthwiseDeconv2dNeonK4x4S2(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - float *output) { - const index_t w = in_shape[3]; - const index_t h = in_shape[2]; - const index_t channels = in_shape[1]; - const index_t in_img_size = h * w; - - const index_t outh = out_shape[2]; - const index_t outw = out_shape[3]; - const index_t out_img_size = outh * outw; - -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t c = 0; c < channels; ++c) { - const index_t offset = b * channels + c; - float *out_base = output + offset * out_img_size; - const float *input_base = input + offset * in_img_size; - const float *kernel_base = filter + c * 16; - const float *in = input_base; - - const float *k0 = kernel_base; - const float *k1 = kernel_base + 4; - const float *k2 = kernel_base + 8; - const float *k3 = kernel_base + 12; -#if defined(MACE_ENABLE_NEON) - float32x4_t k0_vec = vld1q_f32(k0); - float32x4_t k1_vec = vld1q_f32(k1); - float32x4_t k2_vec = vld1q_f32(k2); - float32x4_t k3_vec = vld1q_f32(k3); -#endif - for (index_t i = 0; i < h; i++) { - float *out_row = out_base + 2 * i * outw; - - float *out_row_0 = out_row; - float *out_row_1 = out_row_0 + outw; - float *out_row_2 = out_row_1 + outw; - float *out_row_3 = out_row_2 + outw; - - index_t j = 0; -#if defined(MACE_ENABLE_NEON) - for (index_t n = 0; n + 9 < outw; n += 8) { - float32x4_t in_vec = vld1q_f32(in); - - // row 0 - float32x4x2_t out0 = vld2q_f32(out_row_0); - out0.val[0] = - neon_vfma_lane_0(out0.val[0], in_vec, k0_vec); - out0.val[1] = - neon_vfma_lane_1(out0.val[1], in_vec, k0_vec); - vst2q_f32(out_row_0, out0); - out0 = vld2q_f32(out_row_0 + 2); - out0.val[0] = - neon_vfma_lane_2(out0.val[0], in_vec, k0_vec); - out0.val[1] = - neon_vfma_lane_3(out0.val[1], in_vec, k0_vec); - vst2q_f32(out_row_0 + 2, out0); - - // row 1 - float32x4x2_t out1 = vld2q_f32(out_row_1); - out1.val[0] = - neon_vfma_lane_0(out1.val[0], in_vec, k1_vec); - out1.val[1] = - neon_vfma_lane_1(out1.val[1], in_vec, k1_vec); - vst2q_f32(out_row_1, out1); - out1 = vld2q_f32(out_row_1 + 2); - out1.val[0] = - neon_vfma_lane_2(out1.val[0], in_vec, k1_vec); - out1.val[1] = - neon_vfma_lane_3(out1.val[1], in_vec, k1_vec); - vst2q_f32(out_row_1 + 2, out1); - - // row 2 - float32x4x2_t out2 = vld2q_f32(out_row_2); - out2.val[0] = - neon_vfma_lane_0(out2.val[0], in_vec, k2_vec); - out2.val[1] = - neon_vfma_lane_1(out2.val[1], 
in_vec, k2_vec); - vst2q_f32(out_row_2, out2); - out2 = vld2q_f32(out_row_2 + 2); - out2.val[0] = - neon_vfma_lane_2(out2.val[0], in_vec, k2_vec); - out2.val[1] = - neon_vfma_lane_3(out2.val[1], in_vec, k2_vec); - vst2q_f32(out_row_2 + 2, out2); - - // row 3 - float32x4x2_t out3 = vld2q_f32(out_row_3); - out3.val[0] = - neon_vfma_lane_0(out3.val[0], in_vec, k3_vec); - out3.val[1] = - neon_vfma_lane_1(out3.val[1], in_vec, k3_vec); - vst2q_f32(out_row_3, out3); - out3 = vld2q_f32(out_row_3 + 2); - out3.val[0] = - neon_vfma_lane_2(out3.val[0], in_vec, k3_vec); - out3.val[1] = - neon_vfma_lane_3(out3.val[1], in_vec, k3_vec); - vst2q_f32(out_row_3 + 2, out3); - - in += 4; - out_row_0 += 8; - out_row_1 += 8; - out_row_2 += 8; - out_row_3 += 8; - j += 4; - } -#endif - for (; j < w; j++) { - float val = in[0]; - for (int k = 0; k < 4; ++k) { - out_row_0[k] += val * k0[k]; - out_row_1[k] += val * k1[k]; - out_row_2[k] += val * k2[k]; - out_row_3[k] += val * k3[k]; - } - in++; - out_row_0 += 2; - out_row_1 += 2; - out_row_2 += 2; - out_row_3 += 2; - } - } - } - } -} - -void GroupDeconv2dNeonK4x4S1(const float *input, - const float *filter, - const int group, - const index_t *in_shape, - const index_t *out_shape, - float *output) { - const index_t w = in_shape[3]; - const index_t h = in_shape[2]; - const index_t inch = in_shape[1]; - - const index_t outh = out_shape[2]; - const index_t outw = out_shape[3]; - const index_t outch = out_shape[1]; - - const index_t in_img_size = h * w; - const index_t out_img_size = outh * outw; - - const index_t inch_g = inch / group; - const index_t outch_g = outch / group; - -#pragma omp parallel for collapse(3) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (int g = 0; g < group; ++g) { - for (index_t oc = 0; oc < outch_g; oc += 2) { - if (oc + 1 < outch_g) { - const index_t out_offset = - (b * outch + outch_g * g + oc) * out_img_size; - float *out_base = output + out_offset; - float *out_base1 = out_base + out_img_size; - for (index_t ic = 0; ic < inch_g; ic++) { - const index_t in_offset = - (b * inch + inch_g * g + ic) * in_img_size; - const float *input_base = input + in_offset; - const float *in = input_base; - const index_t kernel_offset = - ((oc * group + g) * inch_g + ic) * 16; - const float *kernel_base = filter + kernel_offset; - const float *k0 = kernel_base; - const float *k1 = kernel_base + 4; - const float *k2 = kernel_base + 8; - const float *k3 = kernel_base + 12; - - const float *kernel_base1 = kernel_base + inch * 16; - const float *k10 = kernel_base1; - const float *k11 = kernel_base1 + 4; - const float *k12 = kernel_base1 + 8; - const float *k13 = kernel_base1 + 12; -#if defined(MACE_ENABLE_NEON) - float32x4_t k0_vec = vld1q_f32(k0); - float32x4_t k1_vec = vld1q_f32(k1); - float32x4_t k2_vec = vld1q_f32(k2); - float32x4_t k3_vec = vld1q_f32(k3); - - float32x4_t k10_vec = vld1q_f32(k10); - float32x4_t k11_vec = vld1q_f32(k11); - float32x4_t k12_vec = vld1q_f32(k12); - float32x4_t k13_vec = vld1q_f32(k13); -#endif - for (index_t i = 0; i < h; i++) { - float *out_row = out_base + i * outw; - - float *out_row_0 = out_row; - float *out_row_1 = out_row_0 + outw; - float *out_row_2 = out_row_1 + outw; - float *out_row_3 = out_row_2 + outw; - - float *out_row1 = out_base1 + i * outw; - - float *out_row1_0 = out_row1; - float *out_row1_1 = out_row1_0 + outw; - float *out_row1_2 = out_row1_1 + outw; - float *out_row1_3 = out_row1_2 + outw; - - index_t j = 0; -#if defined(MACE_ENABLE_NEON) - for (; j + 3 < w; j += 4) { - float32x4_t 
in_vec = vld1q_f32(in); - float32x4_t out00, out01, out02, out03; - float32x4_t out10, out11, out12, out13; - - out00 = vld1q_f32(out_row_0); - out00 = neon_vfma_lane_0(out00, in_vec, k0_vec); - vst1q_f32(out_row_0, out00); - - out10 = vld1q_f32(out_row1_0); - out10 = neon_vfma_lane_0(out10, in_vec, k10_vec); - vst1q_f32(out_row1_0, out10); - - out01 = vld1q_f32(out_row_0 + 1); - out01 = neon_vfma_lane_1(out01, in_vec, k0_vec); - vst1q_f32(out_row_0 + 1, out01); - - out11 = vld1q_f32(out_row1_0 + 1); - out11 = neon_vfma_lane_1(out11, in_vec, k10_vec); - vst1q_f32(out_row1_0 + 1, out11); - - out02 = vld1q_f32(out_row_0 + 2); - out02 = neon_vfma_lane_2(out02, in_vec, k0_vec); - vst1q_f32(out_row_0 + 2, out02); - - out12 = vld1q_f32(out_row1_0 + 2); - out12 = neon_vfma_lane_2(out12, in_vec, k10_vec); - vst1q_f32(out_row1_0 + 2, out12); - - out03 = vld1q_f32(out_row_0 + 3); - out03 = neon_vfma_lane_3(out03, in_vec, k0_vec); - vst1q_f32(out_row_0 + 3, out03); - - out13 = vld1q_f32(out_row1_0 + 3); - out13 = neon_vfma_lane_3(out13, in_vec, k10_vec); - vst1q_f32(out_row1_0 + 3, out13); - - // - out00 = vld1q_f32(out_row_1); - out00 = neon_vfma_lane_0(out00, in_vec, k1_vec); - vst1q_f32(out_row_1, out00); - - out10 = vld1q_f32(out_row1_1); - out10 = neon_vfma_lane_0(out10, in_vec, k11_vec); - vst1q_f32(out_row1_1, out10); - - out01 = vld1q_f32(out_row_1 + 1); - out01 = neon_vfma_lane_1(out01, in_vec, k1_vec); - vst1q_f32(out_row_1 + 1, out01); - - out11 = vld1q_f32(out_row1_1 + 1); - out11 = neon_vfma_lane_1(out11, in_vec, k11_vec); - vst1q_f32(out_row1_1 + 1, out11); - - out02 = vld1q_f32(out_row_1 + 2); - out02 = neon_vfma_lane_2(out02, in_vec, k1_vec); - vst1q_f32(out_row_1 + 2, out02); - - out12 = vld1q_f32(out_row1_1 + 2); - out12 = neon_vfma_lane_2(out12, in_vec, k11_vec); - vst1q_f32(out_row1_1 + 2, out12); - - out03 = vld1q_f32(out_row_1 + 3); - out03 = neon_vfma_lane_3(out03, in_vec, k1_vec); - vst1q_f32(out_row_1 + 3, out03); - - out13 = vld1q_f32(out_row1_1 + 3); - out13 = neon_vfma_lane_3(out13, in_vec, k11_vec); - vst1q_f32(out_row1_1 + 3, out13); - - // - out00 = vld1q_f32(out_row_2 + 0); - out00 = neon_vfma_lane_0(out00, in_vec, k2_vec); - vst1q_f32(out_row_2 + 0, out00); - - out10 = vld1q_f32(out_row1_2 + 0); - out10 = neon_vfma_lane_0(out10, in_vec, k12_vec); - vst1q_f32(out_row1_2 + 0, out10); - - out01 = vld1q_f32(out_row_2 + 1); - out01 = neon_vfma_lane_1(out01, in_vec, k2_vec); - vst1q_f32(out_row_2 + 1, out01); - - out11 = vld1q_f32(out_row1_2 + 1); - out11 = neon_vfma_lane_1(out11, in_vec, k12_vec); - vst1q_f32(out_row1_2 + 1, out11); - - out02 = vld1q_f32(out_row_2 + 2); - out02 = neon_vfma_lane_2(out02, in_vec, k2_vec); - vst1q_f32(out_row_2 + 2, out02); - - out12 = vld1q_f32(out_row1_2 + 2); - out12 = neon_vfma_lane_2(out12, in_vec, k12_vec); - vst1q_f32(out_row1_2 + 2, out12); - - out03 = vld1q_f32(out_row_2 + 3); - out03 = neon_vfma_lane_3(out03, in_vec, k2_vec); - vst1q_f32(out_row_2 + 3, out03); - - out13 = vld1q_f32(out_row1_2 + 3); - out13 = neon_vfma_lane_3(out13, in_vec, k12_vec); - vst1q_f32(out_row1_2 + 3, out13); - - // - out00 = vld1q_f32(out_row_3 + 0); - out00 = neon_vfma_lane_0(out00, in_vec, k3_vec); - vst1q_f32(out_row_3 + 0, out00); - - out10 = vld1q_f32(out_row1_3 + 0); - out10 = neon_vfma_lane_0(out10, in_vec, k13_vec); - vst1q_f32(out_row1_3 + 0, out10); - - out01 = vld1q_f32(out_row_3 + 1); - out01 = neon_vfma_lane_1(out01, in_vec, k3_vec); - vst1q_f32(out_row_3 + 1, out01); - - out11 = vld1q_f32(out_row1_3 + 1); - out11 = neon_vfma_lane_1(out11, 
in_vec, k13_vec); - vst1q_f32(out_row1_3 + 1, out11); - - out02 = vld1q_f32(out_row_3 + 2); - out02 = neon_vfma_lane_2(out02, in_vec, k3_vec); - vst1q_f32(out_row_3 + 2, out02); - - out12 = vld1q_f32(out_row1_3 + 2); - out12 = neon_vfma_lane_2(out12, in_vec, k13_vec); - vst1q_f32(out_row1_3 + 2, out12); - - out03 = vld1q_f32(out_row_3 + 3); - out03 = neon_vfma_lane_3(out03, in_vec, k3_vec); - vst1q_f32(out_row_3 + 3, out03); - - out13 = vld1q_f32(out_row1_3 + 3); - out13 = neon_vfma_lane_3(out13, in_vec, k13_vec); - vst1q_f32(out_row1_3 + 3, out13); - - in += 4; - out_row_0 += 4; - out_row_1 += 4; - out_row_2 += 4; - out_row_3 += 4; - out_row1_0 += 4; - out_row1_1 += 4; - out_row1_2 += 4; - out_row1_3 += 4; - } -#endif - for (; j < w; j++) { - float val = in[0]; - for (int k = 0; k < 4; ++k) { - out_row_0[k] += val * k0[k]; - out_row_1[k] += val * k1[k]; - out_row_2[k] += val * k2[k]; - out_row_3[k] += val * k3[k]; - out_row1_0[k] += val * k10[k]; - out_row1_1[k] += val * k11[k]; - out_row1_2[k] += val * k12[k]; - out_row1_3[k] += val * k13[k]; - } - in++; - out_row_0++; - out_row_1++; - out_row_2++; - out_row_3++; - out_row1_0++; - out_row1_1++; - out_row1_2++; - out_row1_3++; - } - } - } - } else { - const index_t out_offset = - (b * outch + outch_g * g + oc) * out_img_size; - float *out_base = output + out_offset; - for (index_t ic = 0; ic < inch_g; ++ic) { - const index_t in_offset = - (b * inch + inch_g * g + ic) * in_img_size; - const index_t kernel_offset = - ((oc * group + g) * inch_g + ic) * 16; - - const float *input_base = input + in_offset; - const float *kernel_base = filter + kernel_offset; - const float *in = input_base; - const float *k0 = kernel_base; - const float *k1 = kernel_base + 4; - const float *k2 = kernel_base + 8; - const float *k3 = kernel_base + 12; -#if defined(MACE_ENABLE_NEON) - float32x4_t k0_vec = vld1q_f32(k0); - float32x4_t k1_vec = vld1q_f32(k1); - float32x4_t k2_vec = vld1q_f32(k2); - float32x4_t k3_vec = vld1q_f32(k3); -#endif - for (index_t i = 0; i < h; i++) { - float *out_row = out_base + i * outw; - float *out_row_0 = out_row; - float *out_row_1 = out_row_0 + outw; - float *out_row_2 = out_row_1 + outw; - float *out_row_3 = out_row_2 + outw; - index_t j = 0; -#if defined(MACE_ENABLE_NEON) - for (; j + 3 < w; j += 4) { - float32x4_t in_vec = vld1q_f32(in); - - float32x4_t out00 = vld1q_f32(out_row_0); - out00 = neon_vfma_lane_0(out00, in_vec, k0_vec); - vst1q_f32(out_row_0, out00); - - float32x4_t out01 = vld1q_f32(out_row_0 + 1); - out01 = neon_vfma_lane_1(out01, in_vec, k0_vec); - vst1q_f32(out_row_0 + 1, out01); - - float32x4_t out02 = vld1q_f32(out_row_0 + 2); - out02 = neon_vfma_lane_2(out02, in_vec, k0_vec); - vst1q_f32(out_row_0 + 2, out02); - - float32x4_t out03 = vld1q_f32(out_row_0 + 3); - out03 = neon_vfma_lane_3(out03, in_vec, k0_vec); - vst1q_f32(out_row_0 + 3, out03); - - // - float32x4_t out10 = vld1q_f32(out_row_1); - out10 = neon_vfma_lane_0(out10, in_vec, k1_vec); - vst1q_f32(out_row_1, out10); - - float32x4_t out11 = vld1q_f32(out_row_1 + 1); - out11 = neon_vfma_lane_1(out11, in_vec, k1_vec); - vst1q_f32(out_row_1 + 1, out11); - - float32x4_t out12 = vld1q_f32(out_row_1 + 2); - out12 = neon_vfma_lane_2(out12, in_vec, k1_vec); - vst1q_f32(out_row_1 + 2, out12); - - float32x4_t out13 = vld1q_f32(out_row_1 + 3); - out13 = neon_vfma_lane_3(out13, in_vec, k1_vec); - vst1q_f32(out_row_1 + 3, out13); - - // - float32x4_t out20 = vld1q_f32(out_row_2 + 0); - out20 = neon_vfma_lane_0(out20, in_vec, k2_vec); - vst1q_f32(out_row_2 + 0, 
out20); - - float32x4_t out21 = vld1q_f32(out_row_2 + 1); - out21 = neon_vfma_lane_1(out21, in_vec, k2_vec); - vst1q_f32(out_row_2 + 1, out21); - - float32x4_t out22 = vld1q_f32(out_row_2 + 2); - out22 = neon_vfma_lane_2(out22, in_vec, k2_vec); - vst1q_f32(out_row_2 + 2, out22); - - float32x4_t out23 = vld1q_f32(out_row_2 + 3); - out23 = neon_vfma_lane_3(out23, in_vec, k2_vec); - vst1q_f32(out_row_2 + 3, out23); - - // - float32x4_t out30 = vld1q_f32(out_row_3 + 0); - out30 = neon_vfma_lane_0(out30, in_vec, k3_vec); - vst1q_f32(out_row_3 + 0, out30); - - float32x4_t out31 = vld1q_f32(out_row_3 + 1); - out31 = neon_vfma_lane_1(out31, in_vec, k3_vec); - vst1q_f32(out_row_3 + 1, out31); - - float32x4_t out32 = vld1q_f32(out_row_3 + 2); - out32 = neon_vfma_lane_2(out32, in_vec, k3_vec); - vst1q_f32(out_row_3 + 2, out32); - - float32x4_t out33 = vld1q_f32(out_row_3 + 3); - out33 = neon_vfma_lane_3(out33, in_vec, k3_vec); - vst1q_f32(out_row_3 + 3, out33); - - in += 4; - out_row_0 += 4; - out_row_1 += 4; - out_row_2 += 4; - out_row_3 += 4; - } -#endif - for (; j < w; j++) { - float val = in[0]; - for (int k = 0; k < 4; ++k) { - out_row_0[k] += val * k0[k]; - out_row_1[k] += val * k1[k]; - out_row_2[k] += val * k2[k]; - out_row_3[k] += val * k3[k]; - } - in++; - out_row_0++; - out_row_1++; - out_row_2++; - out_row_3++; - } - } - } - } - } - } - } -} - -void GroupDeconv2dNeonK4x4S2(const float *input, - const float *filter, - const int group, - const index_t *in_shape, - const index_t *out_shape, - float *output) { - const index_t w = in_shape[3]; - const index_t h = in_shape[2]; - const index_t inch = in_shape[1]; - - const index_t outh = out_shape[2]; - const index_t outw = out_shape[3]; - const index_t outch = out_shape[1]; - const index_t in_img_size = h * w; - const index_t out_img_size = outh * outw; - - const index_t inch_g = inch / group; - const index_t outch_g = outch / group; - -#pragma omp parallel for collapse(3) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (int g = 0; g < group; ++g) { - for (index_t oc = 0; oc < outch_g; oc++) { - const index_t out_offset = - (b * outch + outch_g * g + oc) * out_img_size; - float *out_base = output + out_offset; - for (index_t ic = 0; ic < inch_g; ic++) { - const index_t in_offset = - (b * inch + inch_g * g + ic) * in_img_size; - const index_t kernel_offset = - ((oc * group + g) * inch_g + ic) * 16; - const float *input_base = input + in_offset; - const float *kernel_base = filter + kernel_offset; - const float *in = input_base; - - const float *k0 = kernel_base; - const float *k1 = kernel_base + 4; - const float *k2 = kernel_base + 8; - const float *k3 = kernel_base + 12; -#if defined(MACE_ENABLE_NEON) - float32x4_t k0_vec = vld1q_f32(k0); - float32x4_t k1_vec = vld1q_f32(k1); - float32x4_t k2_vec = vld1q_f32(k2); - float32x4_t k3_vec = vld1q_f32(k3); -#endif - for (index_t i = 0; i < h; i++) { - float *out_row = out_base + 2 * i * outw; - - float *out_row_0 = out_row; - float *out_row_1 = out_row_0 + outw; - float *out_row_2 = out_row_1 + outw; - float *out_row_3 = out_row_2 + outw; - - index_t j = 0; -#if defined(MACE_ENABLE_NEON) - for (index_t n = 0; n + 9 < outw; n += 8) { - float32x4_t in_vec = vld1q_f32(in); - - // row 0 - float32x4x2_t out0 = vld2q_f32(out_row_0); - out0.val[0] = - neon_vfma_lane_0(out0.val[0], in_vec, k0_vec); - out0.val[1] = - neon_vfma_lane_1(out0.val[1], in_vec, k0_vec); - vst2q_f32(out_row_0, out0); - out0 = vld2q_f32(out_row_0 + 2); - out0.val[0] = - neon_vfma_lane_2(out0.val[0], in_vec, 
k0_vec); - out0.val[1] = - neon_vfma_lane_3(out0.val[1], in_vec, k0_vec); - vst2q_f32(out_row_0 + 2, out0); - - // row 1 - float32x4x2_t out1 = vld2q_f32(out_row_1); - out1.val[0] = - neon_vfma_lane_0(out1.val[0], in_vec, k1_vec); - out1.val[1] = - neon_vfma_lane_1(out1.val[1], in_vec, k1_vec); - vst2q_f32(out_row_1, out1); - out1 = vld2q_f32(out_row_1 + 2); - out1.val[0] = - neon_vfma_lane_2(out1.val[0], in_vec, k1_vec); - out1.val[1] = - neon_vfma_lane_3(out1.val[1], in_vec, k1_vec); - vst2q_f32(out_row_1 + 2, out1); - - // row 2 - float32x4x2_t out2 = vld2q_f32(out_row_2); - out2.val[0] = - neon_vfma_lane_0(out2.val[0], in_vec, k2_vec); - out2.val[1] = - neon_vfma_lane_1(out2.val[1], in_vec, k2_vec); - vst2q_f32(out_row_2, out2); - out2 = vld2q_f32(out_row_2 + 2); - out2.val[0] = - neon_vfma_lane_2(out2.val[0], in_vec, k2_vec); - out2.val[1] = - neon_vfma_lane_3(out2.val[1], in_vec, k2_vec); - vst2q_f32(out_row_2 + 2, out2); - - // row 3 - float32x4x2_t out3 = vld2q_f32(out_row_3); - out3.val[0] = - neon_vfma_lane_0(out3.val[0], in_vec, k3_vec); - out3.val[1] = - neon_vfma_lane_1(out3.val[1], in_vec, k3_vec); - vst2q_f32(out_row_3, out3); - out3 = vld2q_f32(out_row_3 + 2); - out3.val[0] = - neon_vfma_lane_2(out3.val[0], in_vec, k3_vec); - out3.val[1] = - neon_vfma_lane_3(out3.val[1], in_vec, k3_vec); - vst2q_f32(out_row_3 + 2, out3); - - in += 4; - out_row_0 += 8; - out_row_1 += 8; - out_row_2 += 8; - out_row_3 += 8; - j += 4; - } -#endif - for (; j < w; j++) { - float val = in[0]; - for (int k = 0; k < 4; ++k) { - out_row_0[k] += val * k0[k]; - out_row_1[k] += val * k1[k]; - out_row_2[k] += val * k2[k]; - out_row_3[k] += val * k3[k]; - } - in++; - out_row_0 += 2; - out_row_1 += 2; - out_row_2 += 2; - out_row_3 += 2; - } - } - } - } - } - } -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/arm/fp32/activation.cc b/mace/ops/arm/fp32/activation.cc new file mode 100644 index 0000000000000000000000000000000000000000..cac3badb523262663820b93e2527588f49be4923 --- /dev/null +++ b/mace/ops/arm/fp32/activation.cc @@ -0,0 +1,183 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "mace/ops/arm/fp32/activation.h" + +#include +#include + +namespace mace { +namespace ops { +namespace arm { +namespace fp32 { + +Activation::Activation(ActivationType type, + const float limit, + const float leakyrelu_coefficient) + : type_(type), + limit_(limit), + leakyrelu_coefficient_(leakyrelu_coefficient) {} + +MaceStatus Activation::Compute(const OpContext *context, + const Tensor *input, + Tensor *output) { + Tensor::MappingGuard input_guard(input); + if (input != output) { + MACE_RETURN_IF_ERROR(output->ResizeLike(input)); + Tensor::MappingGuard output_guard(output); + DoActivation(context, input, output); + } else { + DoActivation(context, input, output); + } + + return MaceStatus::MACE_SUCCESS; +} + +void Activation::DoActivation(const OpContext *context, + const Tensor *input, + Tensor *output) { + auto input_data = input->data(); + auto output_data = output->mutable_data(); + const index_t size = input->size(); + + utils::ThreadPool &thread_pool = + context->device()->cpu_runtime()->thread_pool(); + + switch (type_) { + case RELU: { + const float32x4_t vzero = vdupq_n_f32(0.f); + const index_t block_count = size / 4; + + thread_pool.Compute1D( + [=](index_t start, index_t end, index_t step) { + auto input_ptr = input_data + start * 4; + auto output_ptr = output_data + start * 4; + + for (index_t i = start; i < end; i += step) { + float32x4_t v = vld1q_f32(input_ptr); + v = vmaxq_f32(v, vzero); + vst1q_f32(output_ptr, v); + + input_ptr += 4; + output_ptr += 4; + } + }, + 0, block_count, 1); + + // remain + for (index_t i = block_count * 4; i < size; ++i) { + output_data[i] = std::max(0.f, input_data[i]); + } + + break; + } + + case RELUX: { + const float32x4_t vzero = vdupq_n_f32(0.f); + const float32x4_t vlimit = vdupq_n_f32(limit_); + const index_t block_count = size / 4; + + thread_pool.Compute1D( + [=](index_t start, index_t end, index_t step) { + auto input_ptr = input_data + start * 4; + auto output_ptr = output_data + start * 4; + + for (index_t i = start; i < end; i += step) { + float32x4_t v = vld1q_f32(input_ptr); + v = vmaxq_f32(v, vzero); + v = vminq_f32(v, vlimit); + vst1q_f32(output_ptr, v); + + input_ptr += 4; + output_ptr += 4; + } + }, + 0, block_count, 1); + + // remain + for (index_t i = block_count * 4; i < size; ++i) { + output_data[i] = std::max(0.f, std::min(limit_, input_data[i])); + } + + break; + } + + case LEAKYRELU: { + const float32x4_t vzero = vdupq_n_f32(0.f); + const float32x4_t valpha = vdupq_n_f32(leakyrelu_coefficient_); + const index_t block_count = size / 4; + + thread_pool.Compute1D( + [=](index_t start, index_t end, index_t step) { + auto input_ptr = input_data + start * 4; + auto output_ptr = output_data + start * 4; + + for (index_t i = start; i < end; i += step) { + float32x4_t v = vld1q_f32(input_ptr); + float32x4_t u = vminq_f32(v, vzero); + v = vmaxq_f32(v, vzero); + v = vmlaq_f32(v, valpha, u); + vst1q_f32(output_ptr, v); + + input_ptr += 4; + output_ptr += 4; + } + }, + 0, block_count, 1); + + // remain + for (index_t i = block_count * 4; i < size; ++i) { + output_data[i] = std::max(input_data[i], 0.f) + + std::min(input_data[i], 0.f) * leakyrelu_coefficient_; + } + + break; + } + + case TANH: { + thread_pool.Compute1D( + [=](index_t start, index_t end, index_t step) { + for (index_t i = start; i < end; i += step) { + output_data[i] = std::tanh(input_data[i]); + } + }, + 0, size, 1); + + break; + } + + case SIGMOID: { + thread_pool.Compute1D( + [=](index_t start, index_t end, index_t step) { + for (index_t i = start; i 
< end; i += step) { + output_data[i] = 1 / (1 + std::exp(-(input_data[i]))); + } + }, + 0, size, 1); + + break; + } + + case NOOP: + break; + + default: + MACE_NOT_IMPLEMENTED; + } +} + +} // namespace fp32 +} // namespace arm +} // namespace ops +} // namespace mace diff --git a/mace/ops/arm/fp32/activation.h b/mace/ops/arm/fp32/activation.h new file mode 100644 index 0000000000000000000000000000000000000000..265915d0c3a8d3bdbab3e4c0d0f60521730dec34 --- /dev/null +++ b/mace/ops/arm/fp32/activation.h @@ -0,0 +1,53 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_OPS_ARM_FP32_ACTIVATION_H_ +#define MACE_OPS_ARM_FP32_ACTIVATION_H_ + +#include "mace/core/op_context.h" +#include "mace/ops/common/activation_type.h" + +namespace mace { +namespace ops { +namespace arm { +namespace fp32 { + +class Activation { + public: + explicit Activation(ActivationType type, + const float limit, + const float leakyrelu_coefficient); + ~Activation() = default; + + MaceStatus Compute( + const OpContext *context, + const Tensor *input, + Tensor *output); + + private: + void DoActivation(const OpContext *context, + const Tensor *input, + Tensor *output); + + ActivationType type_; + const float limit_; + const float leakyrelu_coefficient_; +}; + +} // namespace fp32 +} // namespace arm +} // namespace ops +} // namespace mace + +#endif // MACE_OPS_ARM_FP32_ACTIVATION_H_ diff --git a/mace/ops/arm/fp32/bias_add.cc b/mace/ops/arm/fp32/bias_add.cc new file mode 100644 index 0000000000000000000000000000000000000000..de4b6d575b194b253243cdfb3ffe7ceebec3f045 --- /dev/null +++ b/mace/ops/arm/fp32/bias_add.cc @@ -0,0 +1,95 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "mace/ops/arm/fp32/bias_add.h" + +#include + +namespace mace { +namespace ops { +namespace arm { +namespace fp32 { + +MaceStatus BiasAdd::Compute(const OpContext *context, + const Tensor *input, + const Tensor *bias, + Tensor *output) { + Tensor::MappingGuard input_guard(input); + Tensor::MappingGuard bias_guard(bias); + if (input != output) { + MACE_RETURN_IF_ERROR(output->ResizeLike(input)); + if (bias == nullptr) { + output->Copy(*input); + } else { + Tensor::MappingGuard output_guard(output); + AddBias(context, input, bias, output); + } + } else { + if (bias != nullptr) { + AddBias(context, input, bias, output); + } + } + + return MaceStatus::MACE_SUCCESS; +} + +void BiasAdd::AddBias(const OpContext *context, + const Tensor *input, + const Tensor *bias, + mace::Tensor *output) { + auto input_data = input->data(); + auto bias_data = bias->data(); + auto output_data = output->mutable_data(); + + const index_t batch = input->dim(0); + const index_t channels = input->dim(1); + const index_t height = output->dim(2); + const index_t width = output->dim(3); + const index_t image_size = height * width; + const index_t block_count = image_size / 4; + const index_t remain = image_size % 4; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { + const index_t offset = (b * channels + c) * image_size; + auto input_ptr = input_data + offset; + auto output_ptr = output_data + offset; + const float bias = bias_data[c]; + float32x4_t vbias = vdupq_n_f32(bias); + + for (index_t i = 0; i < block_count; ++i) { + float32x4_t v = vld1q_f32(input_ptr); + v = vaddq_f32(v, vbias); + vst1q_f32(output_ptr, v); + + input_ptr += 4; + output_ptr += 4; + } + for (index_t i = 0; i < remain; ++i) { + (*output_ptr++) = (*input_ptr++) + bias; + } + } + } + }, 0, batch, 1, 0, channels, 1); +} + +} // namespace fp32 +} // namespace arm +} // namespace ops +} // namespace mace + diff --git a/mace/ops/arm/fp32/bias_add.h b/mace/ops/arm/fp32/bias_add.h new file mode 100644 index 0000000000000000000000000000000000000000..a3e6849157472bc9df8117299cf3f0d01ca203d8 --- /dev/null +++ b/mace/ops/arm/fp32/bias_add.h @@ -0,0 +1,48 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef MACE_OPS_ARM_FP32_BIAS_ADD_H_ +#define MACE_OPS_ARM_FP32_BIAS_ADD_H_ + +#include "mace/core/op_context.h" + +namespace mace { +namespace ops { +namespace arm { +namespace fp32 { + +class BiasAdd { + public: + BiasAdd() = default; + ~BiasAdd() = default; + + MaceStatus Compute( + const OpContext *context, + const Tensor *input, + const Tensor *bias, + Tensor *output); + + private: + void AddBias(const OpContext *context, + const Tensor *input, + const Tensor *bias, + Tensor *output); +}; + +} // namespace fp32 +} // namespace arm +} // namespace ops +} // namespace mace + +#endif // MACE_OPS_ARM_FP32_BIAS_ADD_H_ diff --git a/mace/ops/arm/common_neon.h b/mace/ops/arm/fp32/common_neon.h similarity index 90% rename from mace/ops/arm/common_neon.h rename to mace/ops/arm/fp32/common_neon.h index 8d28f5581c6ad43dd90fe1965e16e6ab7bec48c8..8ac2cb7c787bf386fb15678bfd014ae760933dba 100644 --- a/mace/ops/arm/common_neon.h +++ b/mace/ops/arm/fp32/common_neon.h @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_OPS_ARM_COMMON_NEON_H_ -#define MACE_OPS_ARM_COMMON_NEON_H_ +#ifndef MACE_OPS_ARM_FP32_COMMON_NEON_H_ +#define MACE_OPS_ARM_FP32_COMMON_NEON_H_ #if defined(MACE_ENABLE_NEON) #include @@ -21,6 +21,8 @@ namespace mace { namespace ops { +namespace arm { +namespace fp32 { #ifdef MACE_ENABLE_NEON inline float32x4_t neon_vfma_lane_0(float32x4_t a, @@ -64,7 +66,9 @@ inline float32x4_t neon_vfma_lane_3(float32x4_t a, } #endif +} // namespace fp32 +} // namespace arm } // namespace ops } // namespace mace -#endif // MACE_OPS_ARM_COMMON_NEON_H_ +#endif // MACE_OPS_ARM_FP32_COMMON_NEON_H_ diff --git a/mace/ops/arm/fp32/conv_2d.cc b/mace/ops/arm/fp32/conv_2d.cc index 2602279423dc753be085c66bf67bb4cbee86bcc7..357b47754b0b9bf814302be042f56651883594a5 100644 --- a/mace/ops/arm/fp32/conv_2d.cc +++ b/mace/ops/arm/fp32/conv_2d.cc @@ -12,11 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "mace/ops/arm/fp32/conv_2d.h" + #include #include #include -#include "mace/ops/arm/fp32/conv_2d.h" #include "mace/utils/memory.h" namespace mace { @@ -195,7 +196,7 @@ MaceStatus Conv2dBase::ResizeOutAndPadInOut(const OpContext *context, void Conv2dBase::PadInput(const Tensor &src, const int pad_top, const int pad_left, - mace::Tensor *dst) { + Tensor *dst) { if (dst == &src) return; const index_t batch = src.dim(0); const index_t channels = src.dim(1); @@ -211,7 +212,6 @@ void Conv2dBase::PadInput(const Tensor &src, const index_t img_size = height * width; const index_t padded_img_size = padded_height * padded_width; -#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < batch; ++b) { for (index_t c = 0; c < channels; ++c) { const index_t bc = b * channels + c; @@ -238,7 +238,7 @@ void Conv2dBase::PadInput(const Tensor &src, } } -void Conv2dBase::UnPadOutput(const mace::Tensor &src, mace::Tensor *dst) { +void Conv2dBase::UnPadOutput(const Tensor &src, Tensor *dst) { if (dst == &src) return; const index_t batch = dst->dim(0); const index_t channels = dst->dim(1); @@ -253,7 +253,6 @@ void Conv2dBase::UnPadOutput(const mace::Tensor &src, mace::Tensor *dst) { const index_t img_size = height * width; const index_t padded_img_size = padded_height * padded_width; -#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < batch; ++b) { for (index_t c = 0; c < channels; ++c) { const index_t bc = (b * channels + c); diff --git a/mace/ops/arm/fp32/conv_2d.h b/mace/ops/arm/fp32/conv_2d.h index 1383767bf278f1f6c11aec8047732aca98afa45a..dc8d0effd101e77df88473c884fcdb670768379e 100644 --- a/mace/ops/arm/fp32/conv_2d.h +++ b/mace/ops/arm/fp32/conv_2d.h @@ -31,9 +31,9 @@ namespace fp32 { class Conv2dBase { public: - Conv2dBase(const std::vector strides, - const std::vector dilations, - const std::vector paddings, + Conv2dBase(const std::vector &strides, + const std::vector &dilations, + const std::vector &paddings, const Padding padding_type) : strides_(strides), dilations_(dilations), diff --git a/mace/ops/arm/fp32/conv_2d_1x1.h b/mace/ops/arm/fp32/conv_2d_1x1.h index 68b792fd96b3c5dd77504614894d3008bbd01e01..cde94ea01927ad544bb347eaea53bcb55b01f7f8 100644 --- a/mace/ops/arm/fp32/conv_2d_1x1.h +++ b/mace/ops/arm/fp32/conv_2d_1x1.h @@ -29,7 +29,7 @@ namespace fp32 { class Conv2dK1x1 : public Conv2dBase { public: - Conv2dK1x1(const std::vector paddings, const Padding padding_type) + Conv2dK1x1(const std::vector &paddings, const Padding padding_type) : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {} virtual ~Conv2dK1x1() {} @@ -37,7 +37,7 @@ class Conv2dK1x1 : public Conv2dBase { const OpContext *context, const Tensor *input, const Tensor *filter, - Tensor *output); + Tensor *output) override; private: Gemm gemm_; diff --git a/mace/ops/arm/fp32/conv_2d_1xn.cc b/mace/ops/arm/fp32/conv_2d_1xn.cc index 1ff99d8021438d8b851b65d6ee2c662e01e72917..3be9e3eb5dca7ecf4ecf66b1371796872c5cd0b5 100644 --- a/mace/ops/arm/fp32/conv_2d_1xn.cc +++ b/mace/ops/arm/fp32/conv_2d_1xn.cc @@ -12,9 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "mace/ops/arm/fp32/conv_2d_1xn.h" + #include #include -#include "mace/ops/arm/fp32/conv_2d_1xn.h" namespace mace { namespace ops { @@ -37,11 +38,11 @@ MaceStatus Conv2dK1x7S1::Compute(const OpContext *context, &padded_input, &padded_output); const Tensor *in_tensor = input; - if (padded_input.get() != nullptr) { + if (padded_input != nullptr) { in_tensor = padded_input.get(); } Tensor *out_tensor = output; - if (padded_output.get() != nullptr) { + if (padded_output != nullptr) { out_tensor = padded_output.get(); } out_tensor->Clear(); @@ -53,82 +54,90 @@ MaceStatus Conv2dK1x7S1::Compute(const OpContext *context, auto input_data = in_tensor->data(); auto output_data = out_tensor->mutable_data(); - auto in_shape = in_tensor->shape(); - auto out_shape = out_tensor->shape(); - - const index_t in_image_size = in_shape[2] * in_shape[3]; - const index_t out_image_size = out_shape[2] * out_shape[3]; - const index_t in_batch_size = in_shape[1] * in_image_size; - const index_t out_batch_size = out_shape[1] * out_image_size; - -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t m = 0; m < out_shape[1]; m += 4) { - const index_t out_channels = out_shape[1]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - const index_t in_channels = in_shape[1]; - const index_t in_width = in_shape[3]; - if (m + 3 < out_channels) { - float *out_ptr0_base = - output_data + b * out_batch_size + m * out_image_size; - float *out_ptr1_base = - output_data + b * out_batch_size + (m + 1) * out_image_size; - float *out_ptr2_base = - output_data + b * out_batch_size + (m + 2) * out_image_size; - float *out_ptr3_base = - output_data + b * out_batch_size + (m + 3) * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { - const float *in_ptr_base = - input_data + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = filter_data + m * in_channels * 7 + c * 7; - const float - *filter_ptr1 = filter_data + (m + 1) * in_channels * 7 + c * 7; - const float - *filter_ptr2 = filter_data + (m + 2) * in_channels * 7 + c * 7; - const float - *filter_ptr3 = filter_data + (m + 3) * in_channels * 7 + c * 7; - /* load filter (4 outch x 1 height x 4 width) */ - float32x4_t vf00, vf01; - float32x4_t vf10, vf11; - float32x4_t vf20, vf21; - float32x4_t vf30, vf31; - vf00 = vld1q_f32(filter_ptr0); - vf01 = vld1q_f32(filter_ptr0 + 3); - vf10 = vld1q_f32(filter_ptr1); - vf11 = vld1q_f32(filter_ptr1 + 3); - vf20 = vld1q_f32(filter_ptr2); - vf21 = vld1q_f32(filter_ptr2 + 3); - vf30 = vld1q_f32(filter_ptr3); - vf31 = vld1q_f32(filter_ptr3 + 3); - - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { - // output (4 outch x 1 height x 4 width): vo_outch_height - float32x4_t vo0, vo1, vo2, vo3; - // load output - index_t out_offset = h * out_width + w; - vo0 = vld1q_f32(out_ptr0_base + out_offset); - vo1 = vld1q_f32(out_ptr1_base + out_offset); - vo2 = vld1q_f32(out_ptr2_base + out_offset); - vo3 = vld1q_f32(out_ptr3_base + out_offset); - - // input (3 slide) - float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi8; - // input offset - index_t in_offset = h * in_width + w; - // load input - vi0 = vld1q_f32(in_ptr_base + in_offset); - vi4 = vld1q_f32(in_ptr_base + in_offset + 4); - vi8 = vld1q_f32(in_ptr_base + in_offset + 8); - vi1 = vextq_f32(vi0, vi4, 1); - vi2 = vextq_f32(vi0, vi4, 2); - vi3 = vextq_f32(vi0, vi4, 3); - vi5 = vextq_f32(vi4, vi8, 1); - vi6 = vextq_f32(vi4, vi8, 2); + 
auto &in_shape = in_tensor->shape(); + auto &out_shape = out_tensor->shape(); + + const index_t batch = in_shape[0]; + const index_t in_channels = in_shape[1]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + const index_t out_channels = out_shape[1]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + + const index_t in_image_size = in_height * in_width; + const index_t out_image_size = out_height * out_width; + const index_t in_batch_size = in_channels * in_image_size; + const index_t out_batch_size = out_channels * out_image_size; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t m = start1; m < end1; m += step1) { + if (m + 3 < out_channels) { + float *out_ptr0_base = + output_data + b * out_batch_size + m * out_image_size; + float *out_ptr1_base = + output_data + b * out_batch_size + (m + 1) * out_image_size; + float *out_ptr2_base = + output_data + b * out_batch_size + (m + 2) * out_image_size; + float *out_ptr3_base = + output_data + b * out_batch_size + (m + 3) * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr0 = filter_data + m * in_channels * 7 + c * 7; + const float + *filter_ptr1 = filter_data + (m + 1) * in_channels * 7 + c * 7; + const float + *filter_ptr2 = filter_data + (m + 2) * in_channels * 7 + c * 7; + const float + *filter_ptr3 = filter_data + (m + 3) * in_channels * 7 + c * 7; + /* load filter (4 outch x 1 height x 4 width) */ + float32x4_t vf00, vf01; + float32x4_t vf10, vf11; + float32x4_t vf20, vf21; + float32x4_t vf30, vf31; + vf00 = vld1q_f32(filter_ptr0); + vf01 = vld1q_f32(filter_ptr0 + 3); + vf10 = vld1q_f32(filter_ptr1); + vf11 = vld1q_f32(filter_ptr1 + 3); + vf20 = vld1q_f32(filter_ptr2); + vf21 = vld1q_f32(filter_ptr2 + 3); + vf30 = vld1q_f32(filter_ptr3); + vf31 = vld1q_f32(filter_ptr3 + 3); + + for (index_t h = 0; h < out_height; ++h) { + for (index_t w = 0; w + 3 < out_width; w += 4) { + // output (4 outch x 1 height x 4 width): vo_outch_height + float32x4_t vo0, vo1, vo2, vo3; + // load output + index_t out_offset = h * out_width + w; + vo0 = vld1q_f32(out_ptr0_base + out_offset); + vo1 = vld1q_f32(out_ptr1_base + out_offset); + vo2 = vld1q_f32(out_ptr2_base + out_offset); + vo3 = vld1q_f32(out_ptr3_base + out_offset); + + // input (3 slide) + float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi8; + // input offset + index_t in_offset = h * in_width + w; + // load input + vi0 = vld1q_f32(in_ptr_base + in_offset); + vi4 = vld1q_f32(in_ptr_base + in_offset + 4); + vi8 = vld1q_f32(in_ptr_base + in_offset + 8); + vi1 = vextq_f32(vi0, vi4, 1); + vi2 = vextq_f32(vi0, vi4, 2); + vi3 = vextq_f32(vi0, vi4, 3); + vi5 = vextq_f32(vi4, vi8, 1); + vi6 = vextq_f32(vi4, vi8, 2); #if defined(__aarch64__) - /* outch 0 */ + /* outch 0 */ vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0); vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1); vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2); @@ -161,92 +170,7 @@ MaceStatus Conv2dK1x7S1::Compute(const OpContext *context, vo3 = vfmaq_laneq_f32(vo3, vi5, vf31, 2); vo3 = vfmaq_laneq_f32(vo3, vi6, vf31, 3); #else - /* outch 0 */ - vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0); - vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1); - 
vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0); - vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); - vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 1); - vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0); - vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1); - /* outch 1 */ - vo1 = vmlaq_lane_f32(vo1, vi0, vget_low_f32(vf10), 0); - vo1 = vmlaq_lane_f32(vo1, vi1, vget_low_f32(vf10), 1); - vo1 = vmlaq_lane_f32(vo1, vi2, vget_high_f32(vf10), 0); - vo1 = vmlaq_lane_f32(vo1, vi3, vget_high_f32(vf10), 1); - vo1 = vmlaq_lane_f32(vo1, vi4, vget_low_f32(vf11), 1); - vo1 = vmlaq_lane_f32(vo1, vi5, vget_high_f32(vf11), 0); - vo1 = vmlaq_lane_f32(vo1, vi6, vget_high_f32(vf11), 1); - /* outch 2 */ - vo2 = vmlaq_lane_f32(vo2, vi0, vget_low_f32(vf20), 0); - vo2 = vmlaq_lane_f32(vo2, vi1, vget_low_f32(vf20), 1); - vo2 = vmlaq_lane_f32(vo2, vi2, vget_high_f32(vf20), 0); - vo2 = vmlaq_lane_f32(vo2, vi3, vget_high_f32(vf20), 1); - vo2 = vmlaq_lane_f32(vo2, vi4, vget_low_f32(vf21), 1); - vo2 = vmlaq_lane_f32(vo2, vi5, vget_high_f32(vf21), 0); - vo2 = vmlaq_lane_f32(vo2, vi6, vget_high_f32(vf21), 1); - /* outch 3 */ - vo3 = vmlaq_lane_f32(vo3, vi0, vget_low_f32(vf30), 0); - vo3 = vmlaq_lane_f32(vo3, vi1, vget_low_f32(vf30), 1); - vo3 = vmlaq_lane_f32(vo3, vi2, vget_high_f32(vf30), 0); - vo3 = vmlaq_lane_f32(vo3, vi3, vget_high_f32(vf30), 1); - vo3 = vmlaq_lane_f32(vo3, vi4, vget_low_f32(vf31), 1); - vo3 = vmlaq_lane_f32(vo3, vi5, vget_high_f32(vf31), 0); - vo3 = vmlaq_lane_f32(vo3, vi6, vget_high_f32(vf31), 1); -#endif - - vst1q_f32(out_ptr0_base + out_offset, vo0); - vst1q_f32(out_ptr1_base + out_offset, vo1); - vst1q_f32(out_ptr2_base + out_offset, vo2); - vst1q_f32(out_ptr3_base + out_offset, vo3); - } // w - } // h - } // c - } else { - for (index_t mm = m; mm < out_channels; ++mm) { - float *out_ptr0_base = - output_data + b * out_batch_size + mm * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { - const float *in_ptr_base = - input_data + b * in_batch_size + c * in_image_size; - const float - *filter_ptr0 = filter_data + mm * in_channels * 7 + c * 7; - /* load filter (1 outch x 1 height x 4 width) */ - float32x4_t vf00, vf01; - vf00 = vld1q_f32(filter_ptr0); - vf01 = vld1q_f32(filter_ptr0 + 3); - - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { - // output (1 outch x 1 height x 4 width): vo_outch_height - float32x4_t vo0; - // load output - index_t out_offset = h * out_width + w; - vo0 = vld1q_f32(out_ptr0_base + out_offset); - - // input (3 slide) - float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi8; - // input offset - index_t in_offset = h * in_width + w; - // load input - vi0 = vld1q_f32(in_ptr_base + in_offset); - vi4 = vld1q_f32(in_ptr_base + in_offset + 4); - vi8 = vld1q_f32(in_ptr_base + in_offset + 8); - vi1 = vextq_f32(vi0, vi4, 1); - vi2 = vextq_f32(vi0, vi4, 2); - vi3 = vextq_f32(vi0, vi4, 3); - vi5 = vextq_f32(vi4, vi8, 1); - vi6 = vextq_f32(vi4, vi8, 2); - -#if defined(__aarch64__) - vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0); - vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1); - vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2); - vo0 = vfmaq_laneq_f32(vo0, vi3, vf00, 3); - vo0 = vfmaq_laneq_f32(vo0, vi4, vf01, 1); - vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2); - vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3); -#else + /* outch 0 */ vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0); vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1); vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0); @@ -254,16 +178,103 @@ MaceStatus 
Conv2dK1x7S1::Compute(const OpContext *context, vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 1); vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0); vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1); + /* outch 1 */ + vo1 = vmlaq_lane_f32(vo1, vi0, vget_low_f32(vf10), 0); + vo1 = vmlaq_lane_f32(vo1, vi1, vget_low_f32(vf10), 1); + vo1 = vmlaq_lane_f32(vo1, vi2, vget_high_f32(vf10), 0); + vo1 = vmlaq_lane_f32(vo1, vi3, vget_high_f32(vf10), 1); + vo1 = vmlaq_lane_f32(vo1, vi4, vget_low_f32(vf11), 1); + vo1 = vmlaq_lane_f32(vo1, vi5, vget_high_f32(vf11), 0); + vo1 = vmlaq_lane_f32(vo1, vi6, vget_high_f32(vf11), 1); + /* outch 2 */ + vo2 = vmlaq_lane_f32(vo2, vi0, vget_low_f32(vf20), 0); + vo2 = vmlaq_lane_f32(vo2, vi1, vget_low_f32(vf20), 1); + vo2 = vmlaq_lane_f32(vo2, vi2, vget_high_f32(vf20), 0); + vo2 = vmlaq_lane_f32(vo2, vi3, vget_high_f32(vf20), 1); + vo2 = vmlaq_lane_f32(vo2, vi4, vget_low_f32(vf21), 1); + vo2 = vmlaq_lane_f32(vo2, vi5, vget_high_f32(vf21), 0); + vo2 = vmlaq_lane_f32(vo2, vi6, vget_high_f32(vf21), 1); + /* outch 3 */ + vo3 = vmlaq_lane_f32(vo3, vi0, vget_low_f32(vf30), 0); + vo3 = vmlaq_lane_f32(vo3, vi1, vget_low_f32(vf30), 1); + vo3 = vmlaq_lane_f32(vo3, vi2, vget_high_f32(vf30), 0); + vo3 = vmlaq_lane_f32(vo3, vi3, vget_high_f32(vf30), 1); + vo3 = vmlaq_lane_f32(vo3, vi4, vget_low_f32(vf31), 1); + vo3 = vmlaq_lane_f32(vo3, vi5, vget_high_f32(vf31), 0); + vo3 = vmlaq_lane_f32(vo3, vi6, vget_high_f32(vf31), 1); #endif vst1q_f32(out_ptr0_base + out_offset, vo0); + vst1q_f32(out_ptr1_base + out_offset, vo1); + vst1q_f32(out_ptr2_base + out_offset, vo2); + vst1q_f32(out_ptr3_base + out_offset, vo3); } // w } // h } // c - } - } // if - } // m - } // b + } else { + for (index_t mm = m; mm < out_channels; ++mm) { + float *out_ptr0_base = + output_data + b * out_batch_size + mm * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr0 = filter_data + mm * in_channels * 7 + c * 7; + /* load filter (1 outch x 1 height x 4 width) */ + float32x4_t vf00, vf01; + vf00 = vld1q_f32(filter_ptr0); + vf01 = vld1q_f32(filter_ptr0 + 3); + + for (index_t h = 0; h < out_height; ++h) { + for (index_t w = 0; w + 3 < out_width; w += 4) { + // output (1 outch x 1 height x 4 width): vo_outch_height + float32x4_t vo0; + // load output + index_t out_offset = h * out_width + w; + vo0 = vld1q_f32(out_ptr0_base + out_offset); + + // input (3 slide) + float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi8; + // input offset + index_t in_offset = h * in_width + w; + // load input + vi0 = vld1q_f32(in_ptr_base + in_offset); + vi4 = vld1q_f32(in_ptr_base + in_offset + 4); + vi8 = vld1q_f32(in_ptr_base + in_offset + 8); + vi1 = vextq_f32(vi0, vi4, 1); + vi2 = vextq_f32(vi0, vi4, 2); + vi3 = vextq_f32(vi0, vi4, 3); + vi5 = vextq_f32(vi4, vi8, 1); + vi6 = vextq_f32(vi4, vi8, 2); + +#if defined(__aarch64__) + vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0); + vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1); + vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2); + vo0 = vfmaq_laneq_f32(vo0, vi3, vf00, 3); + vo0 = vfmaq_laneq_f32(vo0, vi4, vf01, 1); + vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2); + vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3); +#else + vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0); + vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1); + vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0); + vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); + vo0 = vmlaq_lane_f32(vo0, vi4, 
vget_low_f32(vf01), 1); + vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0); + vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1); +#endif + + vst1q_f32(out_ptr0_base + out_offset, vo0); + } // w + } // h + } // c + } + } // if + } // m + } // b + }, 0, batch, 1, 0, out_channels, 4); + UnPadOutput(*out_tensor, output); return MaceStatus::MACE_SUCCESS; } @@ -284,11 +295,11 @@ MaceStatus Conv2dK7x1S1::Compute(const OpContext *context, &padded_input, &padded_output); const Tensor *in_tensor = input; - if (padded_input.get() != nullptr) { + if (padded_input != nullptr) { in_tensor = padded_input.get(); } Tensor *out_tensor = output; - if (padded_output.get() != nullptr) { + if (padded_output != nullptr) { out_tensor = padded_output.get(); } out_tensor->Clear(); @@ -300,206 +311,84 @@ MaceStatus Conv2dK7x1S1::Compute(const OpContext *context, auto input_data = in_tensor->data(); auto output_data = out_tensor->mutable_data(); - auto in_shape = in_tensor->shape(); - auto out_shape = out_tensor->shape(); - - const index_t in_image_size = in_shape[2] * in_shape[3]; - const index_t out_image_size = out_shape[2] * out_shape[3]; - const index_t in_batch_size = in_shape[1] * in_image_size; - const index_t out_batch_size = out_shape[1] * out_image_size; - -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t m = 0; m < out_shape[1]; m += 4) { - const index_t out_channels = out_shape[1]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - const index_t in_channels = in_shape[1]; - const index_t in_width = in_shape[3]; - if (m + 3 < out_channels) { - float *out_ptr0_base = - output_data + b * out_batch_size + m * out_image_size; - float *out_ptr1_base = - output_data + b * out_batch_size + (m + 1) * out_image_size; - float *out_ptr2_base = - output_data + b * out_batch_size + (m + 2) * out_image_size; - float *out_ptr3_base = - output_data + b * out_batch_size + (m + 3) * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { - const float *in_ptr_base = - input_data + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = filter_data + m * in_channels * 7 + c * 7; - const float - *filter_ptr1 = filter_data + (m + 1) * in_channels * 7 + c * 7; - const float - *filter_ptr2 = filter_data + (m + 2) * in_channels * 7 + c * 7; - const float - *filter_ptr3 = filter_data + (m + 3) * in_channels * 7 + c * 7; - /* load filter (4 outch x 4 height x 1 width) */ - float32x4_t vf00, vf01; - float32x4_t vf10, vf11; - float32x4_t vf20, vf21; - float32x4_t vf30, vf31; - vf00 = vld1q_f32(filter_ptr0); - vf01 = vld1q_f32(filter_ptr0 + 3); - vf10 = vld1q_f32(filter_ptr1); - vf11 = vld1q_f32(filter_ptr1 + 3); - vf20 = vld1q_f32(filter_ptr2); - vf21 = vld1q_f32(filter_ptr2 + 3); - vf30 = vld1q_f32(filter_ptr3); - vf31 = vld1q_f32(filter_ptr3 + 3); - - for (index_t h = 0; h + 3 < out_height; h += 4) { - for (index_t w = 0; w < out_width; ++w) { - // load output - index_t out_offset = h * out_width + w; - // output (4 outch x 4 height x 1 width): vo_outch_height - float32x4_t vo0 = {out_ptr0_base[out_offset], - out_ptr0_base[out_offset + out_width], - out_ptr0_base[out_offset + 2 * out_width], - out_ptr0_base[out_offset + 3 * out_width]}; - float32x4_t vo1 = {out_ptr1_base[out_offset], - out_ptr1_base[out_offset + out_width], - out_ptr1_base[out_offset + 2 * out_width], - out_ptr1_base[out_offset + 3 * out_width]}; - float32x4_t vo2 = {out_ptr2_base[out_offset], - out_ptr2_base[out_offset + out_width], 
- out_ptr2_base[out_offset + 2 * out_width], - out_ptr2_base[out_offset + 3 * out_width]}; - float32x4_t vo3 = {out_ptr3_base[out_offset], - out_ptr3_base[out_offset + out_width], - out_ptr3_base[out_offset + 2 * out_width], - out_ptr3_base[out_offset + 3 * out_width]}; - - // input offset - index_t in_offset = h * in_width + w; - // input (3 slide) - float32x4_t vi0 = {in_ptr_base[in_offset], - in_ptr_base[in_offset + in_width], - in_ptr_base[in_offset + 2 * in_width], - in_ptr_base[in_offset + 3 * in_width]}; - float32x4_t vi4 = {in_ptr_base[in_offset + 4 * in_width], - in_ptr_base[in_offset + 5 * in_width], - in_ptr_base[in_offset + 6 * in_width], - in_ptr_base[in_offset + 7 * in_width]}; - float32x4_t vi8 = {in_ptr_base[in_offset + 8 * in_width], - in_ptr_base[in_offset + 9 * in_width]}; - float32x4_t vi1 = vextq_f32(vi0, vi4, 1); - float32x4_t vi2 = vextq_f32(vi0, vi4, 2); - float32x4_t vi3 = vextq_f32(vi0, vi4, 3); - float32x4_t vi5 = vextq_f32(vi4, vi8, 1); - float32x4_t vi6 = vextq_f32(vi4, vi8, 2); - -#if defined(__aarch64__) - /* outch 0 */ - vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0); - vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1); - vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2); - vo0 = vfmaq_laneq_f32(vo0, vi3, vf00, 3); - vo0 = vfmaq_laneq_f32(vo0, vi4, vf01, 1); - vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2); - vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3); - /* outch 1 */ - vo1 = vfmaq_laneq_f32(vo1, vi0, vf10, 0); - vo1 = vfmaq_laneq_f32(vo1, vi1, vf10, 1); - vo1 = vfmaq_laneq_f32(vo1, vi2, vf10, 2); - vo1 = vfmaq_laneq_f32(vo1, vi3, vf10, 3); - vo1 = vfmaq_laneq_f32(vo1, vi4, vf11, 1); - vo1 = vfmaq_laneq_f32(vo1, vi5, vf11, 2); - vo1 = vfmaq_laneq_f32(vo1, vi6, vf11, 3); - /* outch 2 */ - vo2 = vfmaq_laneq_f32(vo2, vi0, vf20, 0); - vo2 = vfmaq_laneq_f32(vo2, vi1, vf20, 1); - vo2 = vfmaq_laneq_f32(vo2, vi2, vf20, 2); - vo2 = vfmaq_laneq_f32(vo2, vi3, vf20, 3); - vo2 = vfmaq_laneq_f32(vo2, vi4, vf21, 1); - vo2 = vfmaq_laneq_f32(vo2, vi5, vf21, 2); - vo2 = vfmaq_laneq_f32(vo2, vi6, vf21, 3); - /* outch 3 */ - vo3 = vfmaq_laneq_f32(vo3, vi0, vf30, 0); - vo3 = vfmaq_laneq_f32(vo3, vi1, vf30, 1); - vo3 = vfmaq_laneq_f32(vo3, vi2, vf30, 2); - vo3 = vfmaq_laneq_f32(vo3, vi3, vf30, 3); - vo3 = vfmaq_laneq_f32(vo3, vi4, vf31, 1); - vo3 = vfmaq_laneq_f32(vo3, vi5, vf31, 2); - vo3 = vfmaq_laneq_f32(vo3, vi6, vf31, 3); -#else - /* outch 0 */ - vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0); - vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1); - vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0); - vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); - vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 1); - vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0); - vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1); - /* outch 1 */ - vo1 = vmlaq_lane_f32(vo1, vi0, vget_low_f32(vf10), 0); - vo1 = vmlaq_lane_f32(vo1, vi1, vget_low_f32(vf10), 1); - vo1 = vmlaq_lane_f32(vo1, vi2, vget_high_f32(vf10), 0); - vo1 = vmlaq_lane_f32(vo1, vi3, vget_high_f32(vf10), 1); - vo1 = vmlaq_lane_f32(vo1, vi4, vget_low_f32(vf11), 1); - vo1 = vmlaq_lane_f32(vo1, vi5, vget_high_f32(vf11), 0); - vo1 = vmlaq_lane_f32(vo1, vi6, vget_high_f32(vf11), 1); - /* outch 2 */ - vo2 = vmlaq_lane_f32(vo2, vi0, vget_low_f32(vf20), 0); - vo2 = vmlaq_lane_f32(vo2, vi1, vget_low_f32(vf20), 1); - vo2 = vmlaq_lane_f32(vo2, vi2, vget_high_f32(vf20), 0); - vo2 = vmlaq_lane_f32(vo2, vi3, vget_high_f32(vf20), 1); - vo2 = vmlaq_lane_f32(vo2, vi4, vget_low_f32(vf21), 1); - vo2 = vmlaq_lane_f32(vo2, vi5, vget_high_f32(vf21), 0); - 
vo2 = vmlaq_lane_f32(vo2, vi6, vget_high_f32(vf21), 1); - /* outch 3 */ - vo3 = vmlaq_lane_f32(vo3, vi0, vget_low_f32(vf30), 0); - vo3 = vmlaq_lane_f32(vo3, vi1, vget_low_f32(vf30), 1); - vo3 = vmlaq_lane_f32(vo3, vi2, vget_high_f32(vf30), 0); - vo3 = vmlaq_lane_f32(vo3, vi3, vget_high_f32(vf30), 1); - vo3 = vmlaq_lane_f32(vo3, vi4, vget_low_f32(vf31), 1); - vo3 = vmlaq_lane_f32(vo3, vi5, vget_high_f32(vf31), 0); - vo3 = vmlaq_lane_f32(vo3, vi6, vget_high_f32(vf31), 1); -#endif - - out_ptr0_base[out_offset] = vo0[0]; - out_ptr0_base[out_offset + out_width] = vo0[1]; - out_ptr0_base[out_offset + 2 * out_width] = vo0[2]; - out_ptr0_base[out_offset + 3 * out_width] = vo0[3]; - out_ptr1_base[out_offset] = vo1[0]; - out_ptr1_base[out_offset + out_width] = vo1[1]; - out_ptr1_base[out_offset + 2 * out_width] = vo1[2]; - out_ptr1_base[out_offset + 3 * out_width] = vo1[3]; - out_ptr2_base[out_offset] = vo2[0]; - out_ptr2_base[out_offset + out_width] = vo2[1]; - out_ptr2_base[out_offset + 2 * out_width] = vo2[2]; - out_ptr2_base[out_offset + 3 * out_width] = vo2[3]; - out_ptr3_base[out_offset] = vo3[0]; - out_ptr3_base[out_offset + out_width] = vo3[1]; - out_ptr3_base[out_offset + 2 * out_width] = vo3[2]; - out_ptr3_base[out_offset + 3 * out_width] = vo3[3]; - } // w - } // h - } // c - } else { - for (index_t mm = m; mm < out_channels; ++mm) { + auto &in_shape = in_tensor->shape(); + auto &out_shape = out_tensor->shape(); + + const index_t batch = in_shape[0]; + const index_t in_channels = in_shape[1]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + const index_t out_channels = out_shape[1]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + + const index_t in_image_size = in_height * in_width; + const index_t out_image_size = out_height * out_width; + const index_t in_batch_size = in_channels * in_image_size; + const index_t out_batch_size = out_channels * out_image_size; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t m = start1; m < end1; m += step1) { + if (m + 3 < out_channels) { float *out_ptr0_base = - output_data + b * out_batch_size + mm * out_image_size; + output_data + b * out_batch_size + m * out_image_size; + float *out_ptr1_base = + output_data + b * out_batch_size + (m + 1) * out_image_size; + float *out_ptr2_base = + output_data + b * out_batch_size + (m + 2) * out_image_size; + float *out_ptr3_base = + output_data + b * out_batch_size + (m + 3) * out_image_size; for (index_t c = 0; c < in_channels; ++c) { const float *in_ptr_base = input_data + b * in_batch_size + c * in_image_size; const float - *filter_ptr0 = filter_data + mm * in_channels * 7 + c * 7; - /* load filter (1 outch x 4 height x 1 width) */ + *filter_ptr0 = filter_data + m * in_channels * 7 + c * 7; + const float + *filter_ptr1 = filter_data + (m + 1) * in_channels * 7 + c * 7; + const float + *filter_ptr2 = filter_data + (m + 2) * in_channels * 7 + c * 7; + const float + *filter_ptr3 = filter_data + (m + 3) * in_channels * 7 + c * 7; + /* load filter (4 outch x 4 height x 1 width) */ float32x4_t vf00, vf01; + float32x4_t vf10, vf11; + float32x4_t vf20, vf21; + float32x4_t vf30, vf31; vf00 = vld1q_f32(filter_ptr0); vf01 = vld1q_f32(filter_ptr0 + 3); + vf10 = vld1q_f32(filter_ptr1); + vf11 = vld1q_f32(filter_ptr1 + 3); + 
vf20 = vld1q_f32(filter_ptr2); + vf21 = vld1q_f32(filter_ptr2 + 3); + vf30 = vld1q_f32(filter_ptr3); + vf31 = vld1q_f32(filter_ptr3 + 3); for (index_t h = 0; h + 3 < out_height; h += 4) { for (index_t w = 0; w < out_width; ++w) { // load output index_t out_offset = h * out_width + w; - // output (1 outch x 4 height x 1 width): vo_outch_height + // output (4 outch x 4 height x 1 width): vo_outch_height float32x4_t vo0 = {out_ptr0_base[out_offset], out_ptr0_base[out_offset + out_width], out_ptr0_base[out_offset + 2 * out_width], out_ptr0_base[out_offset + 3 * out_width]}; + float32x4_t vo1 = {out_ptr1_base[out_offset], + out_ptr1_base[out_offset + out_width], + out_ptr1_base[out_offset + 2 * out_width], + out_ptr1_base[out_offset + 3 * out_width]}; + float32x4_t vo2 = {out_ptr2_base[out_offset], + out_ptr2_base[out_offset + out_width], + out_ptr2_base[out_offset + 2 * out_width], + out_ptr2_base[out_offset + 3 * out_width]}; + float32x4_t vo3 = {out_ptr3_base[out_offset], + out_ptr3_base[out_offset + out_width], + out_ptr3_base[out_offset + 2 * out_width], + out_ptr3_base[out_offset + 3 * out_width]}; // input offset index_t in_offset = h * in_width + w; @@ -513,9 +402,7 @@ MaceStatus Conv2dK7x1S1::Compute(const OpContext *context, in_ptr_base[in_offset + 6 * in_width], in_ptr_base[in_offset + 7 * in_width]}; float32x4_t vi8 = {in_ptr_base[in_offset + 8 * in_width], - in_ptr_base[in_offset + 9 * in_width], - in_ptr_base[in_offset + 10 * in_width], - in_ptr_base[in_offset + 11 * in_width]}; + in_ptr_base[in_offset + 9 * in_width]}; float32x4_t vi1 = vextq_f32(vi0, vi4, 1); float32x4_t vi2 = vextq_f32(vi0, vi4, 2); float32x4_t vi3 = vextq_f32(vi0, vi4, 3); @@ -523,6 +410,7 @@ MaceStatus Conv2dK7x1S1::Compute(const OpContext *context, float32x4_t vi6 = vextq_f32(vi4, vi8, 2); #if defined(__aarch64__) + /* outch 0 */ vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0); vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1); vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2); @@ -530,7 +418,32 @@ MaceStatus Conv2dK7x1S1::Compute(const OpContext *context, vo0 = vfmaq_laneq_f32(vo0, vi4, vf01, 1); vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2); vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3); + /* outch 1 */ + vo1 = vfmaq_laneq_f32(vo1, vi0, vf10, 0); + vo1 = vfmaq_laneq_f32(vo1, vi1, vf10, 1); + vo1 = vfmaq_laneq_f32(vo1, vi2, vf10, 2); + vo1 = vfmaq_laneq_f32(vo1, vi3, vf10, 3); + vo1 = vfmaq_laneq_f32(vo1, vi4, vf11, 1); + vo1 = vfmaq_laneq_f32(vo1, vi5, vf11, 2); + vo1 = vfmaq_laneq_f32(vo1, vi6, vf11, 3); + /* outch 2 */ + vo2 = vfmaq_laneq_f32(vo2, vi0, vf20, 0); + vo2 = vfmaq_laneq_f32(vo2, vi1, vf20, 1); + vo2 = vfmaq_laneq_f32(vo2, vi2, vf20, 2); + vo2 = vfmaq_laneq_f32(vo2, vi3, vf20, 3); + vo2 = vfmaq_laneq_f32(vo2, vi4, vf21, 1); + vo2 = vfmaq_laneq_f32(vo2, vi5, vf21, 2); + vo2 = vfmaq_laneq_f32(vo2, vi6, vf21, 3); + /* outch 3 */ + vo3 = vfmaq_laneq_f32(vo3, vi0, vf30, 0); + vo3 = vfmaq_laneq_f32(vo3, vi1, vf30, 1); + vo3 = vfmaq_laneq_f32(vo3, vi2, vf30, 2); + vo3 = vfmaq_laneq_f32(vo3, vi3, vf30, 3); + vo3 = vfmaq_laneq_f32(vo3, vi4, vf31, 1); + vo3 = vfmaq_laneq_f32(vo3, vi5, vf31, 2); + vo3 = vfmaq_laneq_f32(vo3, vi6, vf31, 3); #else + /* outch 0 */ vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0); vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1); vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0); @@ -538,26 +451,131 @@ MaceStatus Conv2dK7x1S1::Compute(const OpContext *context, vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 1); vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0); vo0 = vmlaq_lane_f32(vo0, 
vi6, vget_high_f32(vf01), 1); + /* outch 1 */ + vo1 = vmlaq_lane_f32(vo1, vi0, vget_low_f32(vf10), 0); + vo1 = vmlaq_lane_f32(vo1, vi1, vget_low_f32(vf10), 1); + vo1 = vmlaq_lane_f32(vo1, vi2, vget_high_f32(vf10), 0); + vo1 = vmlaq_lane_f32(vo1, vi3, vget_high_f32(vf10), 1); + vo1 = vmlaq_lane_f32(vo1, vi4, vget_low_f32(vf11), 1); + vo1 = vmlaq_lane_f32(vo1, vi5, vget_high_f32(vf11), 0); + vo1 = vmlaq_lane_f32(vo1, vi6, vget_high_f32(vf11), 1); + /* outch 2 */ + vo2 = vmlaq_lane_f32(vo2, vi0, vget_low_f32(vf20), 0); + vo2 = vmlaq_lane_f32(vo2, vi1, vget_low_f32(vf20), 1); + vo2 = vmlaq_lane_f32(vo2, vi2, vget_high_f32(vf20), 0); + vo2 = vmlaq_lane_f32(vo2, vi3, vget_high_f32(vf20), 1); + vo2 = vmlaq_lane_f32(vo2, vi4, vget_low_f32(vf21), 1); + vo2 = vmlaq_lane_f32(vo2, vi5, vget_high_f32(vf21), 0); + vo2 = vmlaq_lane_f32(vo2, vi6, vget_high_f32(vf21), 1); + /* outch 3 */ + vo3 = vmlaq_lane_f32(vo3, vi0, vget_low_f32(vf30), 0); + vo3 = vmlaq_lane_f32(vo3, vi1, vget_low_f32(vf30), 1); + vo3 = vmlaq_lane_f32(vo3, vi2, vget_high_f32(vf30), 0); + vo3 = vmlaq_lane_f32(vo3, vi3, vget_high_f32(vf30), 1); + vo3 = vmlaq_lane_f32(vo3, vi4, vget_low_f32(vf31), 1); + vo3 = vmlaq_lane_f32(vo3, vi5, vget_high_f32(vf31), 0); + vo3 = vmlaq_lane_f32(vo3, vi6, vget_high_f32(vf31), 1); #endif out_ptr0_base[out_offset] = vo0[0]; out_ptr0_base[out_offset + out_width] = vo0[1]; out_ptr0_base[out_offset + 2 * out_width] = vo0[2]; out_ptr0_base[out_offset + 3 * out_width] = vo0[3]; + out_ptr1_base[out_offset] = vo1[0]; + out_ptr1_base[out_offset + out_width] = vo1[1]; + out_ptr1_base[out_offset + 2 * out_width] = vo1[2]; + out_ptr1_base[out_offset + 3 * out_width] = vo1[3]; + out_ptr2_base[out_offset] = vo2[0]; + out_ptr2_base[out_offset + out_width] = vo2[1]; + out_ptr2_base[out_offset + 2 * out_width] = vo2[2]; + out_ptr2_base[out_offset + 3 * out_width] = vo2[3]; + out_ptr3_base[out_offset] = vo3[0]; + out_ptr3_base[out_offset + out_width] = vo3[1]; + out_ptr3_base[out_offset + 2 * out_width] = vo3[2]; + out_ptr3_base[out_offset + 3 * out_width] = vo3[3]; } // w } // h } // c - } - } // if - } // m - } // b + } else { + for (index_t mm = m; mm < out_channels; ++mm) { + float *out_ptr0_base = + output_data + b * out_batch_size + mm * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr0 = filter_data + mm * in_channels * 7 + c * 7; + /* load filter (1 outch x 4 height x 1 width) */ + float32x4_t vf00, vf01; + vf00 = vld1q_f32(filter_ptr0); + vf01 = vld1q_f32(filter_ptr0 + 3); + + for (index_t h = 0; h + 3 < out_height; h += 4) { + for (index_t w = 0; w < out_width; ++w) { + // load output + index_t out_offset = h * out_width + w; + // output (1 outch x 4 height x 1 width): vo_outch_height + float32x4_t vo0 = {out_ptr0_base[out_offset], + out_ptr0_base[out_offset + out_width], + out_ptr0_base[out_offset + 2 * out_width], + out_ptr0_base[out_offset + 3 * out_width]}; + + // input offset + index_t in_offset = h * in_width + w; + // input (3 slide) + float32x4_t vi0 = {in_ptr_base[in_offset], + in_ptr_base[in_offset + in_width], + in_ptr_base[in_offset + 2 * in_width], + in_ptr_base[in_offset + 3 * in_width]}; + float32x4_t vi4 = {in_ptr_base[in_offset + 4 * in_width], + in_ptr_base[in_offset + 5 * in_width], + in_ptr_base[in_offset + 6 * in_width], + in_ptr_base[in_offset + 7 * in_width]}; + float32x4_t vi8 = {in_ptr_base[in_offset + 8 * in_width], + in_ptr_base[in_offset + 9 * in_width], + 
in_ptr_base[in_offset + 10 * in_width], + in_ptr_base[in_offset + 11 * in_width]}; + float32x4_t vi1 = vextq_f32(vi0, vi4, 1); + float32x4_t vi2 = vextq_f32(vi0, vi4, 2); + float32x4_t vi3 = vextq_f32(vi0, vi4, 3); + float32x4_t vi5 = vextq_f32(vi4, vi8, 1); + float32x4_t vi6 = vextq_f32(vi4, vi8, 2); + +#if defined(__aarch64__) + vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0); + vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1); + vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2); + vo0 = vfmaq_laneq_f32(vo0, vi3, vf00, 3); + vo0 = vfmaq_laneq_f32(vo0, vi4, vf01, 1); + vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2); + vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3); +#else + vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0); + vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1); + vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0); + vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); + vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 1); + vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0); + vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1); +#endif + + out_ptr0_base[out_offset] = vo0[0]; + out_ptr0_base[out_offset + out_width] = vo0[1]; + out_ptr0_base[out_offset + 2 * out_width] = vo0[2]; + out_ptr0_base[out_offset + 3 * out_width] = vo0[3]; + } // w + } // h + } // c + } + } // if + } // m + } // b + }, 0, batch, 1, 0, out_channels, 4); + UnPadOutput(*out_tensor, output); return MaceStatus::MACE_SUCCESS; } - -// ==== - MaceStatus Conv2dK1x15S1::Compute(const OpContext *context, const Tensor *input, const Tensor *filter, @@ -590,91 +608,104 @@ MaceStatus Conv2dK1x15S1::Compute(const OpContext *context, auto input_data = in_tensor->data(); auto output_data = out_tensor->mutable_data(); - auto in_shape = in_tensor->shape(); - auto out_shape = out_tensor->shape(); + auto &in_shape = in_tensor->shape(); + auto &out_shape = out_tensor->shape(); + + const index_t batch = in_shape[0]; + const index_t in_channels = in_shape[1]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + const index_t out_channels = out_shape[1]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + + const index_t in_image_size = in_height * in_width; + const index_t out_image_size = out_height * out_width; + const index_t in_batch_size = in_channels * in_image_size; + const index_t out_batch_size = out_channels * out_image_size; - const index_t in_image_size = in_shape[2] * in_shape[3]; - const index_t out_image_size = out_shape[2] * out_shape[3]; - const index_t in_batch_size = in_shape[1] * in_image_size; - const index_t out_batch_size = out_shape[1] * out_image_size; const index_t tile_height = - out_shape[1] < 4 ? 
RoundUpDiv4(out_shape[2]) : out_shape[2]; - -#pragma omp parallel for collapse(3) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t m = 0; m < out_shape[1]; ++m) { - for (index_t h = 0; h < out_shape[2]; h += tile_height) { - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - const index_t in_channels = in_shape[1]; - const index_t in_width = in_shape[3]; - float *out_ptr_base = - output_data + b * out_batch_size + m * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { - const float *in_ptr_base = - input_data + b * in_batch_size + c * in_image_size; - const float *filter_ptr = filter_data + m * in_channels * 15 + c * 15; - /* load filter (1 outch x 4 height x 1 width) */ - float32x4_t vf0, vf1, vf2, vf3; - vf0 = vld1q_f32(filter_ptr); - vf1 = vld1q_f32(filter_ptr + 4); - vf2 = vld1q_f32(filter_ptr + 8); - vf3 = vld1q_f32(filter_ptr + 11); - - for (index_t ht = 0; ht < tile_height && h + ht < out_height; ++ht) { - for (index_t w = 0; w + 3 < out_width; w += 4) { - // output (1 outch x 1 height x 4 width): vo_outch_height - float32x4_t vo; - // load output - index_t out_offset = (h + ht) * out_width + w; - vo = vld1q_f32(out_ptr_base + out_offset); - - // input (3 slide) - float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi7, vi8, vi9, - vi10, vi11, vi12, vi13, vi14, vi16; - // input offset - index_t in_offset = (h + ht) * in_width + w; - // load input - vi0 = vld1q_f32(in_ptr_base + in_offset); - vi4 = vld1q_f32(in_ptr_base + in_offset + 4); - vi8 = vld1q_f32(in_ptr_base + in_offset + 8); - vi12 = vld1q_f32(in_ptr_base + in_offset + 12); - vi16 = vld1q_f32(in_ptr_base + in_offset + 16); - vi1 = vextq_f32(vi0, vi4, 1); - vi2 = vextq_f32(vi0, vi4, 2); - vi3 = vextq_f32(vi0, vi4, 3); - vi5 = vextq_f32(vi4, vi8, 1); - vi6 = vextq_f32(vi4, vi8, 2); - vi7 = vextq_f32(vi4, vi8, 3); - vi9 = vextq_f32(vi8, vi12, 1); - vi10 = vextq_f32(vi8, vi12, 2); - vi11 = vextq_f32(vi8, vi12, 3); - vi13 = vextq_f32(vi12, vi16, 1); - vi14 = vextq_f32(vi12, vi16, 2); - - vo = vmlaq_lane_f32(vo, vi0, vget_low_f32(vf0), 0); - vo = vmlaq_lane_f32(vo, vi1, vget_low_f32(vf0), 1); - vo = vmlaq_lane_f32(vo, vi2, vget_high_f32(vf0), 0); - vo = vmlaq_lane_f32(vo, vi3, vget_high_f32(vf0), 1); - vo = vmlaq_lane_f32(vo, vi4, vget_low_f32(vf1), 0); - vo = vmlaq_lane_f32(vo, vi5, vget_low_f32(vf1), 1); - vo = vmlaq_lane_f32(vo, vi6, vget_high_f32(vf1), 0); - vo = vmlaq_lane_f32(vo, vi7, vget_high_f32(vf1), 1); - vo = vmlaq_lane_f32(vo, vi8, vget_low_f32(vf2), 0); - vo = vmlaq_lane_f32(vo, vi9, vget_low_f32(vf2), 1); - vo = vmlaq_lane_f32(vo, vi10, vget_high_f32(vf2), 0); - vo = vmlaq_lane_f32(vo, vi11, vget_high_f32(vf2), 1); - vo = vmlaq_lane_f32(vo, vi12, vget_low_f32(vf3), 1); - vo = vmlaq_lane_f32(vo, vi13, vget_high_f32(vf3), 0); - vo = vmlaq_lane_f32(vo, vi14, vget_high_f32(vf3), 1); - - vst1q_f32(out_ptr_base + out_offset, vo); - } // w - } // ht - } // c - } // h - } // m - } // b + out_channels < 4 ? 
RoundUpDiv4(out_height) : out_height; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t m = start1; m < end1; m += step1) { + for (index_t h = 0; h < out_height; h += tile_height) { + float *out_ptr_base = + output_data + b * out_batch_size + m * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr = filter_data + m * in_channels * 15 + c * 15; + /* load filter (1 outch x 4 height x 1 width) */ + float32x4_t vf0, vf1, vf2, vf3; + vf0 = vld1q_f32(filter_ptr); + vf1 = vld1q_f32(filter_ptr + 4); + vf2 = vld1q_f32(filter_ptr + 8); + vf3 = vld1q_f32(filter_ptr + 11); + + for (index_t ht = 0; ht < tile_height && h + ht < out_height; + ++ht) { + for (index_t w = 0; w + 3 < out_width; w += 4) { + // output (1 outch x 1 height x 4 width): vo_outch_height + float32x4_t vo; + // load output + index_t out_offset = (h + ht) * out_width + w; + vo = vld1q_f32(out_ptr_base + out_offset); + + // input (3 slide) + float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi7, vi8, vi9, + vi10, vi11, vi12, vi13, vi14, vi16; + // input offset + index_t in_offset = (h + ht) * in_width + w; + // load input + vi0 = vld1q_f32(in_ptr_base + in_offset); + vi4 = vld1q_f32(in_ptr_base + in_offset + 4); + vi8 = vld1q_f32(in_ptr_base + in_offset + 8); + vi12 = vld1q_f32(in_ptr_base + in_offset + 12); + vi16 = vld1q_f32(in_ptr_base + in_offset + 16); + vi1 = vextq_f32(vi0, vi4, 1); + vi2 = vextq_f32(vi0, vi4, 2); + vi3 = vextq_f32(vi0, vi4, 3); + vi5 = vextq_f32(vi4, vi8, 1); + vi6 = vextq_f32(vi4, vi8, 2); + vi7 = vextq_f32(vi4, vi8, 3); + vi9 = vextq_f32(vi8, vi12, 1); + vi10 = vextq_f32(vi8, vi12, 2); + vi11 = vextq_f32(vi8, vi12, 3); + vi13 = vextq_f32(vi12, vi16, 1); + vi14 = vextq_f32(vi12, vi16, 2); + + vo = vmlaq_lane_f32(vo, vi0, vget_low_f32(vf0), 0); + vo = vmlaq_lane_f32(vo, vi1, vget_low_f32(vf0), 1); + vo = vmlaq_lane_f32(vo, vi2, vget_high_f32(vf0), 0); + vo = vmlaq_lane_f32(vo, vi3, vget_high_f32(vf0), 1); + vo = vmlaq_lane_f32(vo, vi4, vget_low_f32(vf1), 0); + vo = vmlaq_lane_f32(vo, vi5, vget_low_f32(vf1), 1); + vo = vmlaq_lane_f32(vo, vi6, vget_high_f32(vf1), 0); + vo = vmlaq_lane_f32(vo, vi7, vget_high_f32(vf1), 1); + vo = vmlaq_lane_f32(vo, vi8, vget_low_f32(vf2), 0); + vo = vmlaq_lane_f32(vo, vi9, vget_low_f32(vf2), 1); + vo = vmlaq_lane_f32(vo, vi10, vget_high_f32(vf2), 0); + vo = vmlaq_lane_f32(vo, vi11, vget_high_f32(vf2), 1); + vo = vmlaq_lane_f32(vo, vi12, vget_low_f32(vf3), 1); + vo = vmlaq_lane_f32(vo, vi13, vget_high_f32(vf3), 0); + vo = vmlaq_lane_f32(vo, vi14, vget_high_f32(vf3), 1); + + vst1q_f32(out_ptr_base + out_offset, vo); + } // w + } // ht + } // c + } // h + } // m + } // b + }, 0, batch, 1, 0, out_channels, 1); + UnPadOutput(*out_tensor, output); return MaceStatus::MACE_SUCCESS; } @@ -711,106 +742,119 @@ MaceStatus Conv2dK15x1S1::Compute(const OpContext *context, auto input_data = in_tensor->data(); auto output_data = out_tensor->mutable_data(); - auto in_shape = in_tensor->shape(); - auto out_shape = out_tensor->shape(); + auto &in_shape = in_tensor->shape(); + auto &out_shape = out_tensor->shape(); + + const index_t batch = in_shape[0]; + const index_t in_channels = in_shape[1]; + const index_t in_height = in_shape[2]; + const index_t in_width = 
in_shape[3]; + const index_t out_channels = out_shape[1]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + + const index_t in_image_size = in_height * in_width; + const index_t out_image_size = out_height * out_width; + const index_t in_batch_size = in_channels * in_image_size; + const index_t out_batch_size = out_channels * out_image_size; - const index_t in_image_size = in_shape[2] * in_shape[3]; - const index_t out_image_size = out_shape[2] * out_shape[3]; - const index_t in_batch_size = in_shape[1] * in_image_size; - const index_t out_batch_size = out_shape[1] * out_image_size; const index_t tile_width = - out_shape[1] < 4 ? RoundUpDiv4(out_shape[3]) : out_shape[3]; - -#pragma omp parallel for collapse(3) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t m = 0; m < out_shape[1]; ++m) { - for (index_t w = 0; w < out_shape[3]; w += tile_width) { - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - const index_t in_channels = in_shape[1]; - const index_t in_width = in_shape[3]; - float *out_ptr_base = - output_data + b * out_batch_size + m * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { - const float *in_ptr_base = - input_data + b * in_batch_size + c * in_image_size; - const float *filter_ptr = filter_data + m * in_channels * 15 + c * 15; - /* load filter (1 outch x 4 height x 1 width) */ - float32x4_t vf0, vf1, vf2, vf3; - vf0 = vld1q_f32(filter_ptr); - vf1 = vld1q_f32(filter_ptr + 4); - vf2 = vld1q_f32(filter_ptr + 8); - vf3 = vld1q_f32(filter_ptr + 11); - - for (index_t h = 0; h + 3 < out_height; h += 4) { - for (index_t wt = 0; wt < tile_width && w + wt < out_width; ++wt) { - // load output - index_t out_offset = h * out_width + w + wt; - // output (1 outch x 4 height x 1 width): vo_outch_height - float32x4_t vo = {out_ptr_base[out_offset], - out_ptr_base[out_offset + out_width], - out_ptr_base[out_offset + 2 * out_width], - out_ptr_base[out_offset + 3 * out_width]}; - - // input offset - index_t in_offset = h * in_width + w + wt; - // input (3 slide) - float32x4_t vi0 = {in_ptr_base[in_offset], - in_ptr_base[in_offset + in_width], - in_ptr_base[in_offset + 2 * in_width], - in_ptr_base[in_offset + 3 * in_width]}; - float32x4_t vi4 = {in_ptr_base[in_offset + 4 * in_width], - in_ptr_base[in_offset + 5 * in_width], - in_ptr_base[in_offset + 6 * in_width], - in_ptr_base[in_offset + 7 * in_width]}; - float32x4_t vi8 = {in_ptr_base[in_offset + 8 * in_width], - in_ptr_base[in_offset + 9 * in_width], - in_ptr_base[in_offset + 10 * in_width], - in_ptr_base[in_offset + 11 * in_width]}; - float32x4_t vi12 = {in_ptr_base[in_offset + 12 * in_width], - in_ptr_base[in_offset + 13 * in_width], - in_ptr_base[in_offset + 14 * in_width], - in_ptr_base[in_offset + 15 * in_width]}; - float32x4_t vi16 = {in_ptr_base[in_offset + 16 * in_width], - in_ptr_base[in_offset + 17 * in_width]}; - float32x4_t vi1 = vextq_f32(vi0, vi4, 1); - float32x4_t vi2 = vextq_f32(vi0, vi4, 2); - float32x4_t vi3 = vextq_f32(vi0, vi4, 3); - float32x4_t vi5 = vextq_f32(vi4, vi8, 1); - float32x4_t vi6 = vextq_f32(vi4, vi8, 2); - float32x4_t vi7 = vextq_f32(vi4, vi8, 3); - float32x4_t vi9 = vextq_f32(vi8, vi12, 1); - float32x4_t vi10 = vextq_f32(vi8, vi12, 2); - float32x4_t vi11 = vextq_f32(vi8, vi12, 3); - float32x4_t vi13 = vextq_f32(vi12, vi16, 1); - float32x4_t vi14 = vextq_f32(vi12, vi16, 2); - - vo = vmlaq_lane_f32(vo, vi0, vget_low_f32(vf0), 0); - vo = vmlaq_lane_f32(vo, vi1, vget_low_f32(vf0), 1); - vo = 
vmlaq_lane_f32(vo, vi2, vget_high_f32(vf0), 0); - vo = vmlaq_lane_f32(vo, vi3, vget_high_f32(vf0), 1); - vo = vmlaq_lane_f32(vo, vi4, vget_low_f32(vf1), 0); - vo = vmlaq_lane_f32(vo, vi5, vget_low_f32(vf1), 1); - vo = vmlaq_lane_f32(vo, vi6, vget_high_f32(vf1), 0); - vo = vmlaq_lane_f32(vo, vi7, vget_high_f32(vf1), 1); - vo = vmlaq_lane_f32(vo, vi8, vget_low_f32(vf2), 0); - vo = vmlaq_lane_f32(vo, vi9, vget_low_f32(vf2), 1); - vo = vmlaq_lane_f32(vo, vi10, vget_high_f32(vf2), 0); - vo = vmlaq_lane_f32(vo, vi11, vget_high_f32(vf2), 1); - vo = vmlaq_lane_f32(vo, vi12, vget_low_f32(vf3), 1); - vo = vmlaq_lane_f32(vo, vi13, vget_high_f32(vf3), 0); - vo = vmlaq_lane_f32(vo, vi14, vget_high_f32(vf3), 1); - - out_ptr_base[out_offset] = vo[0]; - out_ptr_base[out_offset + out_width] = vo[1]; - out_ptr_base[out_offset + 2 * out_width] = vo[2]; - out_ptr_base[out_offset + 3 * out_width] = vo[3]; - } // wt - } // h - } // c - } // w - } // m - } // b + out_channels < 4 ? RoundUpDiv4(out_width) : out_width; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t m = start1; m < end1; m += step1) { + for (index_t w = 0; w < out_width; w += tile_width) { + float *out_ptr_base = + output_data + b * out_batch_size + m * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr = filter_data + m * in_channels * 15 + c * 15; + /* load filter (1 outch x 4 height x 1 width) */ + float32x4_t vf0, vf1, vf2, vf3; + vf0 = vld1q_f32(filter_ptr); + vf1 = vld1q_f32(filter_ptr + 4); + vf2 = vld1q_f32(filter_ptr + 8); + vf3 = vld1q_f32(filter_ptr + 11); + + for (index_t h = 0; h + 3 < out_height; h += 4) { + for (index_t wt = 0; wt < tile_width && w + wt < out_width; + ++wt) { + // load output + index_t out_offset = h * out_width + w + wt; + // output (1 outch x 4 height x 1 width): vo_outch_height + float32x4_t vo = {out_ptr_base[out_offset], + out_ptr_base[out_offset + out_width], + out_ptr_base[out_offset + 2 * out_width], + out_ptr_base[out_offset + 3 * out_width]}; + + // input offset + index_t in_offset = h * in_width + w + wt; + // input (3 slide) + float32x4_t vi0 = {in_ptr_base[in_offset], + in_ptr_base[in_offset + in_width], + in_ptr_base[in_offset + 2 * in_width], + in_ptr_base[in_offset + 3 * in_width]}; + float32x4_t vi4 = {in_ptr_base[in_offset + 4 * in_width], + in_ptr_base[in_offset + 5 * in_width], + in_ptr_base[in_offset + 6 * in_width], + in_ptr_base[in_offset + 7 * in_width]}; + float32x4_t vi8 = {in_ptr_base[in_offset + 8 * in_width], + in_ptr_base[in_offset + 9 * in_width], + in_ptr_base[in_offset + 10 * in_width], + in_ptr_base[in_offset + 11 * in_width]}; + float32x4_t vi12 = {in_ptr_base[in_offset + 12 * in_width], + in_ptr_base[in_offset + 13 * in_width], + in_ptr_base[in_offset + 14 * in_width], + in_ptr_base[in_offset + 15 * in_width]}; + float32x4_t vi16 = {in_ptr_base[in_offset + 16 * in_width], + in_ptr_base[in_offset + 17 * in_width]}; + float32x4_t vi1 = vextq_f32(vi0, vi4, 1); + float32x4_t vi2 = vextq_f32(vi0, vi4, 2); + float32x4_t vi3 = vextq_f32(vi0, vi4, 3); + float32x4_t vi5 = vextq_f32(vi4, vi8, 1); + float32x4_t vi6 = vextq_f32(vi4, vi8, 2); + float32x4_t vi7 = vextq_f32(vi4, vi8, 3); + float32x4_t vi9 = vextq_f32(vi8, vi12, 1); + float32x4_t vi10 
= vextq_f32(vi8, vi12, 2); + float32x4_t vi11 = vextq_f32(vi8, vi12, 3); + float32x4_t vi13 = vextq_f32(vi12, vi16, 1); + float32x4_t vi14 = vextq_f32(vi12, vi16, 2); + + vo = vmlaq_lane_f32(vo, vi0, vget_low_f32(vf0), 0); + vo = vmlaq_lane_f32(vo, vi1, vget_low_f32(vf0), 1); + vo = vmlaq_lane_f32(vo, vi2, vget_high_f32(vf0), 0); + vo = vmlaq_lane_f32(vo, vi3, vget_high_f32(vf0), 1); + vo = vmlaq_lane_f32(vo, vi4, vget_low_f32(vf1), 0); + vo = vmlaq_lane_f32(vo, vi5, vget_low_f32(vf1), 1); + vo = vmlaq_lane_f32(vo, vi6, vget_high_f32(vf1), 0); + vo = vmlaq_lane_f32(vo, vi7, vget_high_f32(vf1), 1); + vo = vmlaq_lane_f32(vo, vi8, vget_low_f32(vf2), 0); + vo = vmlaq_lane_f32(vo, vi9, vget_low_f32(vf2), 1); + vo = vmlaq_lane_f32(vo, vi10, vget_high_f32(vf2), 0); + vo = vmlaq_lane_f32(vo, vi11, vget_high_f32(vf2), 1); + vo = vmlaq_lane_f32(vo, vi12, vget_low_f32(vf3), 1); + vo = vmlaq_lane_f32(vo, vi13, vget_high_f32(vf3), 0); + vo = vmlaq_lane_f32(vo, vi14, vget_high_f32(vf3), 1); + + out_ptr_base[out_offset] = vo[0]; + out_ptr_base[out_offset + out_width] = vo[1]; + out_ptr_base[out_offset + 2 * out_width] = vo[2]; + out_ptr_base[out_offset + 3 * out_width] = vo[3]; + } // wt + } // h + } // c + } // w + } // m + } // b + }, 0, batch, 1, 0, out_channels, 1); + UnPadOutput(*out_tensor, output); return MaceStatus::MACE_SUCCESS; } diff --git a/mace/ops/arm/fp32/conv_2d_1xn.h b/mace/ops/arm/fp32/conv_2d_1xn.h index a4a5e8995f9ebf5b85c2622684c13e558eb2900f..0bdd66737907627f7dd44e1cb94c24803ea0c8fc 100644 --- a/mace/ops/arm/fp32/conv_2d_1xn.h +++ b/mace/ops/arm/fp32/conv_2d_1xn.h @@ -28,7 +28,7 @@ namespace fp32 { class Conv2dK1x7S1 : public Conv2dBase { public: - Conv2dK1x7S1(const std::vector paddings, const Padding padding_type) + Conv2dK1x7S1(const std::vector &paddings, const Padding padding_type) : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {} virtual ~Conv2dK1x7S1() {} @@ -36,12 +36,12 @@ class Conv2dK1x7S1 : public Conv2dBase { const OpContext *context, const Tensor *input, const Tensor *filter, - Tensor *output); + Tensor *output) override; }; class Conv2dK7x1S1 : public Conv2dBase { public: - Conv2dK7x1S1(const std::vector paddings, const Padding padding_type) + Conv2dK7x1S1(const std::vector &paddings, const Padding padding_type) : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {} virtual ~Conv2dK7x1S1() {} @@ -49,12 +49,12 @@ class Conv2dK7x1S1 : public Conv2dBase { const OpContext *context, const Tensor *input, const Tensor *filter, - Tensor *output); + Tensor *output) override; }; class Conv2dK1x15S1 : public Conv2dBase { public: - Conv2dK1x15S1(const std::vector paddings, const Padding padding_type) + Conv2dK1x15S1(const std::vector &paddings, const Padding padding_type) : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {} virtual ~Conv2dK1x15S1() {} @@ -62,12 +62,12 @@ class Conv2dK1x15S1 : public Conv2dBase { const OpContext *context, const Tensor *input, const Tensor *filter, - Tensor *output); + Tensor *output) override; }; class Conv2dK15x1S1 : public Conv2dBase { public: - Conv2dK15x1S1(const std::vector paddings, const Padding padding_type) + Conv2dK15x1S1(const std::vector &paddings, const Padding padding_type) : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {} virtual ~Conv2dK15x1S1() {} @@ -75,7 +75,7 @@ class Conv2dK15x1S1 : public Conv2dBase { const OpContext *context, const Tensor *input, const Tensor *filter, - Tensor *output); + Tensor *output) override; }; } // namespace fp32 diff --git a/mace/ops/arm/fp32/conv_2d_3x3.cc 
b/mace/ops/arm/fp32/conv_2d_3x3.cc index a8ce5fa64074c08362d0e839a80d111221bc19cb..95c3034138d9ecab67d1aae0ee770ff07ab20788 100644 --- a/mace/ops/arm/fp32/conv_2d_3x3.cc +++ b/mace/ops/arm/fp32/conv_2d_3x3.cc @@ -12,9 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "mace/ops/arm/fp32/conv_2d_3x3.h" + #include #include -#include "mace/ops/arm/fp32/conv_2d_3x3.h" namespace mace { namespace ops { @@ -36,11 +37,11 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, &padded_input, &padded_output); const Tensor *in_tensor = input; - if (padded_input.get() != nullptr) { + if (padded_input != nullptr) { in_tensor = padded_input.get(); } Tensor *out_tensor = output; - if (padded_output.get() != nullptr) { + if (padded_output != nullptr) { out_tensor = padded_output.get(); } out_tensor->Clear(); @@ -52,291 +53,41 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, auto input_data = in_tensor->data(); auto output_data = out_tensor->mutable_data(); - auto in_shape = in_tensor->shape(); - auto out_shape = out_tensor->shape(); - - const index_t in_image_size = in_shape[2] * in_shape[3]; - const index_t out_image_size = out_shape[2] * out_shape[3]; - const index_t in_batch_size = in_shape[1] * in_image_size; - const index_t out_batch_size = out_shape[1] * out_image_size; - -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t m = 0; m < out_shape[1]; m += 2) { - const index_t out_channels = out_shape[1]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - const index_t in_channels = in_shape[1]; - const index_t in_width = in_shape[3]; - if (m + 1 < out_channels) { - float *out_ptr0_base = - output_data + b * out_batch_size + m * out_image_size; - float *out_ptr1_base = - output_data + b * out_batch_size + (m + 1) * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { - const float - *in_ptr0 = input_data + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = filter_data + m * in_channels * 9 + c * 9; - - float *out_ptr1 = out_ptr1_base; - const float *in_ptr1 = - input_data + b * in_batch_size + c * in_image_size + 1 * in_width; - const float *in_ptr2 = - input_data + b * in_batch_size + c * in_image_size + 2 * in_width; - const float *in_ptr3 = - input_data + b * in_batch_size + c * in_image_size + 3 * in_width; - const float - *filter_ptr1 = filter_data + (m + 1) * in_channels * 9 + c * 9; - -#if defined(__aarch64__) - float *out_ptr0 = out_ptr0_base; - - // load filter (2 outch x 3 height x 3 width): vf_outch_height - float32x4_t vf00, vf01, vf02; - float32x4_t vf10, vf11, vf12; - vf00 = vld1q_f32(filter_ptr0); - vf01 = vld1q_f32(filter_ptr0 + 3); - vf02 = vld1q_f32(filter_ptr0 + 6); - - vf10 = vld1q_f32(filter_ptr1); - vf11 = vld1q_f32(filter_ptr1 + 3); - vf12 = vld1q_f32(filter_ptr1 + 6); - - for (index_t h = 0; h + 1 < out_height; h += 2) { - for (index_t w = 0; w + 3 < out_width; w += 4) { - // input (4 height x 3 slide): vi_height_slide - float32x4_t vi00, vi01, vi02; // reg count: 14 - float32x4_t vi10, vi11, vi12; - float32x4_t vi20, vi21, vi22; - float32x4_t vi30, vi31, vi32; - float32x4_t vo20, vo30; // tmp use - - // output (4 outch x 2 height x 4 width): vo_outch_height - float32x4_t vo00, vo01; - float32x4_t vo10, vo11; - - // load input - vi00 = vld1q_f32(in_ptr0); - vo00 = vld1q_f32(in_ptr0 + 4); // reuse vo00: vi0n - vi10 = vld1q_f32(in_ptr1); - vo10 = vld1q_f32(in_ptr1 
+ 4); - vi20 = vld1q_f32(in_ptr2); - vo20 = vld1q_f32(in_ptr2 + 4); - vi30 = vld1q_f32(in_ptr3); - vo30 = vld1q_f32(in_ptr3 + 4); - - vi01 = vextq_f32(vi00, vo00, 1); - vi02 = vextq_f32(vi00, vo00, 2); - vi11 = vextq_f32(vi10, vo10, 1); - vi12 = vextq_f32(vi10, vo10, 2); - vi21 = vextq_f32(vi20, vo20, 1); - vi22 = vextq_f32(vi20, vo20, 2); - vi31 = vextq_f32(vi30, vo30, 1); - vi32 = vextq_f32(vi30, vo30, 2); - - // load ouptut - vo00 = vld1q_f32(out_ptr0); - vo01 = vld1q_f32(out_ptr0 + out_width); - vo10 = vld1q_f32(out_ptr1); - vo11 = vld1q_f32(out_ptr1 + out_width); - - // outch 0, height 0 - vo00 = vfmaq_laneq_f32(vo00, vi00, vf00, 0); // reg count: 18 - vo00 = vfmaq_laneq_f32(vo00, vi01, vf00, 1); - vo00 = vfmaq_laneq_f32(vo00, vi02, vf00, 2); - vo00 = vfmaq_laneq_f32(vo00, vi10, vf01, 0); - vo00 = vfmaq_laneq_f32(vo00, vi11, vf01, 1); - vo00 = vfmaq_laneq_f32(vo00, vi12, vf01, 2); - vo00 = vfmaq_laneq_f32(vo00, vi20, vf02, 0); - vo00 = vfmaq_laneq_f32(vo00, vi21, vf02, 1); - vo00 = vfmaq_laneq_f32(vo00, vi22, vf02, 2); - - // outch 0, height 1 - vo01 = vfmaq_laneq_f32(vo01, vi10, vf00, 0); - vo01 = vfmaq_laneq_f32(vo01, vi11, vf00, 1); - vo01 = vfmaq_laneq_f32(vo01, vi12, vf00, 2); - vo01 = vfmaq_laneq_f32(vo01, vi20, vf01, 0); - vo01 = vfmaq_laneq_f32(vo01, vi21, vf01, 1); - vo01 = vfmaq_laneq_f32(vo01, vi22, vf01, 2); - vo01 = vfmaq_laneq_f32(vo01, vi30, vf02, 0); - vo01 = vfmaq_laneq_f32(vo01, vi31, vf02, 1); - vo01 = vfmaq_laneq_f32(vo01, vi32, vf02, 2); - - // outch 1, height 0 - vo10 = vfmaq_laneq_f32(vo10, vi00, vf10, 0); - vo10 = vfmaq_laneq_f32(vo10, vi01, vf10, 1); - vo10 = vfmaq_laneq_f32(vo10, vi02, vf10, 2); - vo10 = vfmaq_laneq_f32(vo10, vi10, vf11, 0); - vo10 = vfmaq_laneq_f32(vo10, vi11, vf11, 1); - vo10 = vfmaq_laneq_f32(vo10, vi12, vf11, 2); - vo10 = vfmaq_laneq_f32(vo10, vi20, vf12, 0); - vo10 = vfmaq_laneq_f32(vo10, vi21, vf12, 1); - vo10 = vfmaq_laneq_f32(vo10, vi22, vf12, 2); - - // outch 1, height 1 - vo11 = vfmaq_laneq_f32(vo11, vi10, vf10, 0); - vo11 = vfmaq_laneq_f32(vo11, vi11, vf10, 1); - vo11 = vfmaq_laneq_f32(vo11, vi12, vf10, 2); - vo11 = vfmaq_laneq_f32(vo11, vi20, vf11, 0); - vo11 = vfmaq_laneq_f32(vo11, vi21, vf11, 1); - vo11 = vfmaq_laneq_f32(vo11, vi22, vf11, 2); - vo11 = vfmaq_laneq_f32(vo11, vi30, vf12, 0); - vo11 = vfmaq_laneq_f32(vo11, vi31, vf12, 1); - vo11 = vfmaq_laneq_f32(vo11, vi32, vf12, 2); - - vst1q_f32(out_ptr0, vo00); - vst1q_f32(out_ptr0 + out_width, vo01); - vst1q_f32(out_ptr1, vo10); - vst1q_f32(out_ptr1 + out_width, vo11); - - in_ptr0 += 4; - in_ptr1 += 4; - in_ptr2 += 4; - in_ptr3 += 4; - - out_ptr0 += 4; - out_ptr1 += 4; - } // w - - in_ptr0 += 2 + in_width; - in_ptr1 += 2 + in_width; - in_ptr2 += 2 + in_width; - in_ptr3 += 2 + in_width; - - out_ptr0 += out_width; - out_ptr1 += out_width; - } // h -#else // arm v7 - float *out_ptr0 = out_ptr0_base; - - // load filter (2 outch x 3 height x 3 width): vf_outch_height - float32x2_t vf001, vf023, vf045, vf067, vf089; - float32x2_t vf101, vf123, vf145, vf167, vf189; - vf001 = vld1_f32(filter_ptr0); - vf023 = vld1_f32(filter_ptr0 + 2); - vf045 = vld1_f32(filter_ptr0 + 4); - vf067 = vld1_f32(filter_ptr0 + 6); - vf089 = vld1_f32(filter_ptr0 + 8); - - vf101 = vld1_f32(filter_ptr1); - vf123 = vld1_f32(filter_ptr1 + 2); - vf145 = vld1_f32(filter_ptr1 + 4); - vf167 = vld1_f32(filter_ptr1 + 6); - vf189 = vld1_f32(filter_ptr1 + 8); - - for (index_t h = 0; h + 1 < out_height; h += 2) { - for (index_t w = 0; w + 3 < out_width; w += 4) { - // input (4 height x 3 slide): vi_height_slide - 
float32x4_t vi00, vi01, vi02; // reg count: 14 - float32x4_t vi10, vi11, vi12; - float32x4_t vi20, vi21, vi22; - float32x4_t vi30, vi31, vi32; - float32x4_t vo20, vo30; // tmp use - - // output (4 outch x 2 height x 4 width): vo_outch_height - float32x4_t vo00, vo01; - float32x4_t vo10, vo11; - - // load input - vi00 = vld1q_f32(in_ptr0); - vo00 = vld1q_f32(in_ptr0 + 4); // reuse vo00: vi0n - vi10 = vld1q_f32(in_ptr1); - vo10 = vld1q_f32(in_ptr1 + 4); - vi20 = vld1q_f32(in_ptr2); - vo20 = vld1q_f32(in_ptr2 + 4); - vi30 = vld1q_f32(in_ptr3); - vo30 = vld1q_f32(in_ptr3 + 4); - - vi01 = vextq_f32(vi00, vo00, 1); - vi02 = vextq_f32(vi00, vo00, 2); - vi11 = vextq_f32(vi10, vo10, 1); - vi12 = vextq_f32(vi10, vo10, 2); - vi21 = vextq_f32(vi20, vo20, 1); - vi22 = vextq_f32(vi20, vo20, 2); - vi31 = vextq_f32(vi30, vo30, 1); - vi32 = vextq_f32(vi30, vo30, 2); - - // load ouptut - vo00 = vld1q_f32(out_ptr0); - vo01 = vld1q_f32(out_ptr0 + out_width); - vo10 = vld1q_f32(out_ptr1); - vo11 = vld1q_f32(out_ptr1 + out_width); - - // outch 0, height 0 - vo00 = vmlaq_lane_f32(vo00, vi00, vf001, 0); - vo00 = vmlaq_lane_f32(vo00, vi01, vf001, 1); - vo00 = vmlaq_lane_f32(vo00, vi02, vf023, 0); - vo00 = vmlaq_lane_f32(vo00, vi10, vf023, 1); - vo00 = vmlaq_lane_f32(vo00, vi11, vf045, 0); - vo00 = vmlaq_lane_f32(vo00, vi12, vf045, 1); - vo00 = vmlaq_lane_f32(vo00, vi20, vf067, 0); - vo00 = vmlaq_lane_f32(vo00, vi21, vf067, 1); - vo00 = vmlaq_lane_f32(vo00, vi22, vf089, 0); - - // outch 0, height 1 - vo01 = vmlaq_lane_f32(vo01, vi10, vf001, 0); - vo01 = vmlaq_lane_f32(vo01, vi11, vf001, 1); - vo01 = vmlaq_lane_f32(vo01, vi12, vf023, 0); - vo01 = vmlaq_lane_f32(vo01, vi20, vf023, 1); - vo01 = vmlaq_lane_f32(vo01, vi21, vf045, 0); - vo01 = vmlaq_lane_f32(vo01, vi22, vf045, 1); - vo01 = vmlaq_lane_f32(vo01, vi30, vf067, 0); - vo01 = vmlaq_lane_f32(vo01, vi31, vf067, 1); - vo01 = vmlaq_lane_f32(vo01, vi32, vf089, 0); - - // outch 1, height 0 - vo10 = vmlaq_lane_f32(vo10, vi00, vf101, 0); - vo10 = vmlaq_lane_f32(vo10, vi01, vf101, 1); - vo10 = vmlaq_lane_f32(vo10, vi02, vf123, 0); - vo10 = vmlaq_lane_f32(vo10, vi10, vf123, 1); - vo10 = vmlaq_lane_f32(vo10, vi11, vf145, 0); - vo10 = vmlaq_lane_f32(vo10, vi12, vf145, 1); - vo10 = vmlaq_lane_f32(vo10, vi20, vf167, 0); - vo10 = vmlaq_lane_f32(vo10, vi21, vf167, 1); - vo10 = vmlaq_lane_f32(vo10, vi22, vf189, 0); - - // outch 1, height 1 - vo11 = vmlaq_lane_f32(vo11, vi10, vf101, 0); - vo11 = vmlaq_lane_f32(vo11, vi11, vf101, 1); - vo11 = vmlaq_lane_f32(vo11, vi12, vf123, 0); - vo11 = vmlaq_lane_f32(vo11, vi20, vf123, 1); - vo11 = vmlaq_lane_f32(vo11, vi21, vf145, 0); - vo11 = vmlaq_lane_f32(vo11, vi22, vf145, 1); - vo11 = vmlaq_lane_f32(vo11, vi30, vf167, 0); - vo11 = vmlaq_lane_f32(vo11, vi31, vf167, 1); - vo11 = vmlaq_lane_f32(vo11, vi32, vf189, 0); - - vst1q_f32(out_ptr0, vo00); - vst1q_f32(out_ptr0 + out_width, vo01); - vst1q_f32(out_ptr1, vo10); - vst1q_f32(out_ptr1 + out_width, vo11); - - in_ptr0 += 4; - in_ptr1 += 4; - in_ptr2 += 4; - in_ptr3 += 4; - - out_ptr0 += 4; - out_ptr1 += 4; - } // w - - in_ptr0 += 2 + in_width; - in_ptr1 += 2 + in_width; - in_ptr2 += 2 + in_width; - in_ptr3 += 2 + in_width; - - out_ptr0 += out_width; - out_ptr1 += out_width; - } // h -#endif - } // c - } else { - for (index_t mm = m; mm < out_channels; ++mm) { + auto &in_shape = in_tensor->shape(); + auto &out_shape = out_tensor->shape(); + + const index_t batch = in_shape[0]; + const index_t in_channels = in_shape[1]; + const index_t in_height = in_shape[2]; + const index_t in_width = 
in_shape[3]; + const index_t out_channels = out_shape[1]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + + const index_t in_image_size = in_height * in_width; + const index_t out_image_size = out_height * out_width; + const index_t in_batch_size = in_channels * in_image_size; + const index_t out_batch_size = out_channels * out_image_size; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t m = start1; m < end1; m += step1) { + if (m + 1 < out_channels) { float *out_ptr0_base = - output_data + b * out_batch_size + mm * out_image_size; + output_data + b * out_batch_size + m * out_image_size; + float *out_ptr1_base = + output_data + b * out_batch_size + (m + 1) * out_image_size; for (index_t c = 0; c < in_channels; ++c) { - const float *in_ptr0 = - input_data + b * in_batch_size + c * in_image_size; + const float + *in_ptr0 = input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr0 = filter_data + m * in_channels * 9 + c * 9; + + float *out_ptr1 = out_ptr1_base; const float *in_ptr1 = input_data + b * in_batch_size + c * in_image_size + 1 * in_width; @@ -347,61 +98,70 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, input_data + b * in_batch_size + c * in_image_size + 3 * in_width; const float - *filter_ptr0 = filter_data + mm * in_channels * 9 + c * 9; + *filter_ptr1 = filter_data + (m + 1) * in_channels * 9 + c * 9; #if defined(__aarch64__) float *out_ptr0 = out_ptr0_base; - // load filter (1 outch x 3 height x 3 width): vf_outch_height + // load filter (2 outch x 3 height x 3 width): vf_outch_height float32x4_t vf00, vf01, vf02; + float32x4_t vf10, vf11, vf12; vf00 = vld1q_f32(filter_ptr0); vf01 = vld1q_f32(filter_ptr0 + 3); - vf02 = vld1q_f32(filter_ptr0 + 5); + vf02 = vld1q_f32(filter_ptr0 + 6); + + vf10 = vld1q_f32(filter_ptr1); + vf11 = vld1q_f32(filter_ptr1 + 3); + vf12 = vld1q_f32(filter_ptr1 + 6); for (index_t h = 0; h + 1 < out_height; h += 2) { for (index_t w = 0; w + 3 < out_width; w += 4) { // input (4 height x 3 slide): vi_height_slide - float32x4_t vi00, vi01, vi02, vi0n; - float32x4_t vi10, vi11, vi12, vi1n; - float32x4_t vi20, vi21, vi22, vi2n; - float32x4_t vi30, vi31, vi32, vi3n; + float32x4_t vi00, vi01, vi02; // reg count: 14 + float32x4_t vi10, vi11, vi12; + float32x4_t vi20, vi21, vi22; + float32x4_t vi30, vi31, vi32; + float32x4_t vo20, vo30; // tmp use - // output (1 outch x 2 height x 4 width): vo_outch_height + // output (4 outch x 2 height x 4 width): vo_outch_height float32x4_t vo00, vo01; + float32x4_t vo10, vo11; // load input vi00 = vld1q_f32(in_ptr0); - vi0n = vld1q_f32(in_ptr0 + 4); + vo00 = vld1q_f32(in_ptr0 + 4); // reuse vo00: vi0n vi10 = vld1q_f32(in_ptr1); - vi1n = vld1q_f32(in_ptr1 + 4); + vo10 = vld1q_f32(in_ptr1 + 4); vi20 = vld1q_f32(in_ptr2); - vi2n = vld1q_f32(in_ptr2 + 4); + vo20 = vld1q_f32(in_ptr2 + 4); vi30 = vld1q_f32(in_ptr3); - vi3n = vld1q_f32(in_ptr3 + 4); + vo30 = vld1q_f32(in_ptr3 + 4); - vi01 = vextq_f32(vi00, vi0n, 1); - vi02 = vextq_f32(vi00, vi0n, 2); - vi11 = vextq_f32(vi10, vi1n, 1); - vi12 = vextq_f32(vi10, vi1n, 2); - vi21 = vextq_f32(vi20, vi2n, 1); - vi22 = vextq_f32(vi20, vi2n, 2); - vi31 = vextq_f32(vi30, vi3n, 1); - vi32 = vextq_f32(vi30, vi3n, 2); + vi01 = vextq_f32(vi00, vo00, 1); + vi02 = vextq_f32(vi00, vo00, 2); + vi11 = 
vextq_f32(vi10, vo10, 1); + vi12 = vextq_f32(vi10, vo10, 2); + vi21 = vextq_f32(vi20, vo20, 1); + vi22 = vextq_f32(vi20, vo20, 2); + vi31 = vextq_f32(vi30, vo30, 1); + vi32 = vextq_f32(vi30, vo30, 2); // load ouptut vo00 = vld1q_f32(out_ptr0); vo01 = vld1q_f32(out_ptr0 + out_width); + vo10 = vld1q_f32(out_ptr1); + vo11 = vld1q_f32(out_ptr1 + out_width); // outch 0, height 0 - vo00 = vfmaq_laneq_f32(vo00, vi00, vf00, 0); + vo00 = vfmaq_laneq_f32(vo00, vi00, vf00, 0); // reg count: 18 vo00 = vfmaq_laneq_f32(vo00, vi01, vf00, 1); vo00 = vfmaq_laneq_f32(vo00, vi02, vf00, 2); vo00 = vfmaq_laneq_f32(vo00, vi10, vf01, 0); vo00 = vfmaq_laneq_f32(vo00, vi11, vf01, 1); vo00 = vfmaq_laneq_f32(vo00, vi12, vf01, 2); - vo00 = vfmaq_laneq_f32(vo00, vi20, vf02, 1); - vo00 = vfmaq_laneq_f32(vo00, vi21, vf02, 2); - vo00 = vfmaq_laneq_f32(vo00, vi22, vf02, 3); + vo00 = vfmaq_laneq_f32(vo00, vi20, vf02, 0); + vo00 = vfmaq_laneq_f32(vo00, vi21, vf02, 1); + vo00 = vfmaq_laneq_f32(vo00, vi22, vf02, 2); // outch 0, height 1 vo01 = vfmaq_laneq_f32(vo01, vi10, vf00, 0); @@ -410,12 +170,36 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, vo01 = vfmaq_laneq_f32(vo01, vi20, vf01, 0); vo01 = vfmaq_laneq_f32(vo01, vi21, vf01, 1); vo01 = vfmaq_laneq_f32(vo01, vi22, vf01, 2); - vo01 = vfmaq_laneq_f32(vo01, vi30, vf02, 1); - vo01 = vfmaq_laneq_f32(vo01, vi31, vf02, 2); - vo01 = vfmaq_laneq_f32(vo01, vi32, vf02, 3); + vo01 = vfmaq_laneq_f32(vo01, vi30, vf02, 0); + vo01 = vfmaq_laneq_f32(vo01, vi31, vf02, 1); + vo01 = vfmaq_laneq_f32(vo01, vi32, vf02, 2); + + // outch 1, height 0 + vo10 = vfmaq_laneq_f32(vo10, vi00, vf10, 0); + vo10 = vfmaq_laneq_f32(vo10, vi01, vf10, 1); + vo10 = vfmaq_laneq_f32(vo10, vi02, vf10, 2); + vo10 = vfmaq_laneq_f32(vo10, vi10, vf11, 0); + vo10 = vfmaq_laneq_f32(vo10, vi11, vf11, 1); + vo10 = vfmaq_laneq_f32(vo10, vi12, vf11, 2); + vo10 = vfmaq_laneq_f32(vo10, vi20, vf12, 0); + vo10 = vfmaq_laneq_f32(vo10, vi21, vf12, 1); + vo10 = vfmaq_laneq_f32(vo10, vi22, vf12, 2); + + // outch 1, height 1 + vo11 = vfmaq_laneq_f32(vo11, vi10, vf10, 0); + vo11 = vfmaq_laneq_f32(vo11, vi11, vf10, 1); + vo11 = vfmaq_laneq_f32(vo11, vi12, vf10, 2); + vo11 = vfmaq_laneq_f32(vo11, vi20, vf11, 0); + vo11 = vfmaq_laneq_f32(vo11, vi21, vf11, 1); + vo11 = vfmaq_laneq_f32(vo11, vi22, vf11, 2); + vo11 = vfmaq_laneq_f32(vo11, vi30, vf12, 0); + vo11 = vfmaq_laneq_f32(vo11, vi31, vf12, 1); + vo11 = vfmaq_laneq_f32(vo11, vi32, vf12, 2); vst1q_f32(out_ptr0, vo00); vst1q_f32(out_ptr0 + out_width, vo01); + vst1q_f32(out_ptr1, vo10); + vst1q_f32(out_ptr1 + out_width, vo11); in_ptr0 += 4; in_ptr1 += 4; @@ -423,6 +207,7 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, in_ptr3 += 4; out_ptr0 += 4; + out_ptr1 += 4; } // w in_ptr0 += 2 + in_width; @@ -431,76 +216,112 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, in_ptr3 += 2 + in_width; out_ptr0 += out_width; - } // h + out_ptr1 += out_width; + } // h #else // arm v7 float *out_ptr0 = out_ptr0_base; - // load filter (1 outch x 3 height x 3 width): vf_outch_height - float32x2_t vf01, vf23, vf45, vf67, vf78; - vf01 = vld1_f32(filter_ptr0); - vf23 = vld1_f32(filter_ptr0 + 2); - vf45 = vld1_f32(filter_ptr0 + 4); - vf67 = vld1_f32(filter_ptr0 + 6); - vf78 = vld1_f32(filter_ptr0 + 7); + // load filter (2 outch x 3 height x 3 width): vf_outch_height + float32x2_t vf001, vf023, vf045, vf067, vf089; + float32x2_t vf101, vf123, vf145, vf167, vf189; + vf001 = vld1_f32(filter_ptr0); + vf023 = vld1_f32(filter_ptr0 + 2); + vf045 = vld1_f32(filter_ptr0 + 4); + 
vf067 = vld1_f32(filter_ptr0 + 6); + vf089 = vld1_f32(filter_ptr0 + 8); + + vf101 = vld1_f32(filter_ptr1); + vf123 = vld1_f32(filter_ptr1 + 2); + vf145 = vld1_f32(filter_ptr1 + 4); + vf167 = vld1_f32(filter_ptr1 + 6); + vf189 = vld1_f32(filter_ptr1 + 8); for (index_t h = 0; h + 1 < out_height; h += 2) { for (index_t w = 0; w + 3 < out_width; w += 4) { // input (4 height x 3 slide): vi_height_slide - float32x4_t vi00, vi01, vi02, vi0n; - float32x4_t vi10, vi11, vi12, vi1n; - float32x4_t vi20, vi21, vi22, vi2n; - float32x4_t vi30, vi31, vi32, vi3n; + float32x4_t vi00, vi01, vi02; // reg count: 14 + float32x4_t vi10, vi11, vi12; + float32x4_t vi20, vi21, vi22; + float32x4_t vi30, vi31, vi32; + float32x4_t vo20, vo30; // tmp use - // output (1 outch x 2 height x 4 width): vo_outch_height + // output (4 outch x 2 height x 4 width): vo_outch_height float32x4_t vo00, vo01; + float32x4_t vo10, vo11; // load input vi00 = vld1q_f32(in_ptr0); - vi0n = vld1q_f32(in_ptr0 + 4); + vo00 = vld1q_f32(in_ptr0 + 4); // reuse vo00: vi0n vi10 = vld1q_f32(in_ptr1); - vi1n = vld1q_f32(in_ptr1 + 4); + vo10 = vld1q_f32(in_ptr1 + 4); vi20 = vld1q_f32(in_ptr2); - vi2n = vld1q_f32(in_ptr2 + 4); + vo20 = vld1q_f32(in_ptr2 + 4); vi30 = vld1q_f32(in_ptr3); - vi3n = vld1q_f32(in_ptr3 + 4); + vo30 = vld1q_f32(in_ptr3 + 4); - vi01 = vextq_f32(vi00, vi0n, 1); - vi02 = vextq_f32(vi00, vi0n, 2); - vi11 = vextq_f32(vi10, vi1n, 1); - vi12 = vextq_f32(vi10, vi1n, 2); - vi21 = vextq_f32(vi20, vi2n, 1); - vi22 = vextq_f32(vi20, vi2n, 2); - vi31 = vextq_f32(vi30, vi3n, 1); - vi32 = vextq_f32(vi30, vi3n, 2); + vi01 = vextq_f32(vi00, vo00, 1); + vi02 = vextq_f32(vi00, vo00, 2); + vi11 = vextq_f32(vi10, vo10, 1); + vi12 = vextq_f32(vi10, vo10, 2); + vi21 = vextq_f32(vi20, vo20, 1); + vi22 = vextq_f32(vi20, vo20, 2); + vi31 = vextq_f32(vi30, vo30, 1); + vi32 = vextq_f32(vi30, vo30, 2); // load ouptut vo00 = vld1q_f32(out_ptr0); vo01 = vld1q_f32(out_ptr0 + out_width); + vo10 = vld1q_f32(out_ptr1); + vo11 = vld1q_f32(out_ptr1 + out_width); // outch 0, height 0 - vo00 = vmlaq_lane_f32(vo00, vi00, vf01, 0); - vo00 = vmlaq_lane_f32(vo00, vi01, vf01, 1); - vo00 = vmlaq_lane_f32(vo00, vi02, vf23, 0); - vo00 = vmlaq_lane_f32(vo00, vi10, vf23, 1); - vo00 = vmlaq_lane_f32(vo00, vi11, vf45, 0); - vo00 = vmlaq_lane_f32(vo00, vi12, vf45, 1); - vo00 = vmlaq_lane_f32(vo00, vi20, vf67, 0); - vo00 = vmlaq_lane_f32(vo00, vi21, vf67, 1); - vo00 = vmlaq_lane_f32(vo00, vi22, vf78, 1); + vo00 = vmlaq_lane_f32(vo00, vi00, vf001, 0); + vo00 = vmlaq_lane_f32(vo00, vi01, vf001, 1); + vo00 = vmlaq_lane_f32(vo00, vi02, vf023, 0); + vo00 = vmlaq_lane_f32(vo00, vi10, vf023, 1); + vo00 = vmlaq_lane_f32(vo00, vi11, vf045, 0); + vo00 = vmlaq_lane_f32(vo00, vi12, vf045, 1); + vo00 = vmlaq_lane_f32(vo00, vi20, vf067, 0); + vo00 = vmlaq_lane_f32(vo00, vi21, vf067, 1); + vo00 = vmlaq_lane_f32(vo00, vi22, vf089, 0); // outch 0, height 1 - vo01 = vmlaq_lane_f32(vo01, vi10, vf01, 0); - vo01 = vmlaq_lane_f32(vo01, vi11, vf01, 1); - vo01 = vmlaq_lane_f32(vo01, vi12, vf23, 0); - vo01 = vmlaq_lane_f32(vo01, vi20, vf23, 1); - vo01 = vmlaq_lane_f32(vo01, vi21, vf45, 0); - vo01 = vmlaq_lane_f32(vo01, vi22, vf45, 1); - vo01 = vmlaq_lane_f32(vo01, vi30, vf67, 0); - vo01 = vmlaq_lane_f32(vo01, vi31, vf67, 1); - vo01 = vmlaq_lane_f32(vo01, vi32, vf78, 1); + vo01 = vmlaq_lane_f32(vo01, vi10, vf001, 0); + vo01 = vmlaq_lane_f32(vo01, vi11, vf001, 1); + vo01 = vmlaq_lane_f32(vo01, vi12, vf023, 0); + vo01 = vmlaq_lane_f32(vo01, vi20, vf023, 1); + vo01 = vmlaq_lane_f32(vo01, vi21, vf045, 0); 
+ vo01 = vmlaq_lane_f32(vo01, vi22, vf045, 1); + vo01 = vmlaq_lane_f32(vo01, vi30, vf067, 0); + vo01 = vmlaq_lane_f32(vo01, vi31, vf067, 1); + vo01 = vmlaq_lane_f32(vo01, vi32, vf089, 0); + + // outch 1, height 0 + vo10 = vmlaq_lane_f32(vo10, vi00, vf101, 0); + vo10 = vmlaq_lane_f32(vo10, vi01, vf101, 1); + vo10 = vmlaq_lane_f32(vo10, vi02, vf123, 0); + vo10 = vmlaq_lane_f32(vo10, vi10, vf123, 1); + vo10 = vmlaq_lane_f32(vo10, vi11, vf145, 0); + vo10 = vmlaq_lane_f32(vo10, vi12, vf145, 1); + vo10 = vmlaq_lane_f32(vo10, vi20, vf167, 0); + vo10 = vmlaq_lane_f32(vo10, vi21, vf167, 1); + vo10 = vmlaq_lane_f32(vo10, vi22, vf189, 0); + + // outch 1, height 1 + vo11 = vmlaq_lane_f32(vo11, vi10, vf101, 0); + vo11 = vmlaq_lane_f32(vo11, vi11, vf101, 1); + vo11 = vmlaq_lane_f32(vo11, vi12, vf123, 0); + vo11 = vmlaq_lane_f32(vo11, vi20, vf123, 1); + vo11 = vmlaq_lane_f32(vo11, vi21, vf145, 0); + vo11 = vmlaq_lane_f32(vo11, vi22, vf145, 1); + vo11 = vmlaq_lane_f32(vo11, vi30, vf167, 0); + vo11 = vmlaq_lane_f32(vo11, vi31, vf167, 1); + vo11 = vmlaq_lane_f32(vo11, vi32, vf189, 0); vst1q_f32(out_ptr0, vo00); vst1q_f32(out_ptr0 + out_width, vo01); + vst1q_f32(out_ptr1, vo10); + vst1q_f32(out_ptr1 + out_width, vo11); in_ptr0 += 4; in_ptr1 += 4; @@ -508,6 +329,7 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, in_ptr3 += 4; out_ptr0 += 4; + out_ptr1 += 4; } // w in_ptr0 += 2 + in_width; @@ -516,13 +338,204 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, in_ptr3 += 2 + in_width; out_ptr0 += out_width; + out_ptr1 += out_width; } // h #endif } // c - } // mm - } // if - } // m - } // b + } else { + for (index_t mm = m; mm < out_channels; ++mm) { + float *out_ptr0_base = + output_data + b * out_batch_size + mm * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr0 = + input_data + b * in_batch_size + c * in_image_size; + const float *in_ptr1 = + input_data + b * in_batch_size + c * in_image_size + + 1 * in_width; + const float *in_ptr2 = + input_data + b * in_batch_size + c * in_image_size + + 2 * in_width; + const float *in_ptr3 = + input_data + b * in_batch_size + c * in_image_size + + 3 * in_width; + const float + *filter_ptr0 = filter_data + mm * in_channels * 9 + c * 9; + +#if defined(__aarch64__) + float *out_ptr0 = out_ptr0_base; + + // load filter (1 outch x 3 height x 3 width): vf_outch_height + float32x4_t vf00, vf01, vf02; + vf00 = vld1q_f32(filter_ptr0); + vf01 = vld1q_f32(filter_ptr0 + 3); + vf02 = vld1q_f32(filter_ptr0 + 5); + + for (index_t h = 0; h + 1 < out_height; h += 2) { + for (index_t w = 0; w + 3 < out_width; w += 4) { + // input (4 height x 3 slide): vi_height_slide + float32x4_t vi00, vi01, vi02, vi0n; + float32x4_t vi10, vi11, vi12, vi1n; + float32x4_t vi20, vi21, vi22, vi2n; + float32x4_t vi30, vi31, vi32, vi3n; + + // output (1 outch x 2 height x 4 width): vo_outch_height + float32x4_t vo00, vo01; + + // load input + vi00 = vld1q_f32(in_ptr0); + vi0n = vld1q_f32(in_ptr0 + 4); + vi10 = vld1q_f32(in_ptr1); + vi1n = vld1q_f32(in_ptr1 + 4); + vi20 = vld1q_f32(in_ptr2); + vi2n = vld1q_f32(in_ptr2 + 4); + vi30 = vld1q_f32(in_ptr3); + vi3n = vld1q_f32(in_ptr3 + 4); + + vi01 = vextq_f32(vi00, vi0n, 1); + vi02 = vextq_f32(vi00, vi0n, 2); + vi11 = vextq_f32(vi10, vi1n, 1); + vi12 = vextq_f32(vi10, vi1n, 2); + vi21 = vextq_f32(vi20, vi2n, 1); + vi22 = vextq_f32(vi20, vi2n, 2); + vi31 = vextq_f32(vi30, vi3n, 1); + vi32 = vextq_f32(vi30, vi3n, 2); + + // load ouptut + vo00 = vld1q_f32(out_ptr0); + vo01 = vld1q_f32(out_ptr0 + out_width); + 
+ // outch 0, height 0 + vo00 = vfmaq_laneq_f32(vo00, vi00, vf00, 0); + vo00 = vfmaq_laneq_f32(vo00, vi01, vf00, 1); + vo00 = vfmaq_laneq_f32(vo00, vi02, vf00, 2); + vo00 = vfmaq_laneq_f32(vo00, vi10, vf01, 0); + vo00 = vfmaq_laneq_f32(vo00, vi11, vf01, 1); + vo00 = vfmaq_laneq_f32(vo00, vi12, vf01, 2); + vo00 = vfmaq_laneq_f32(vo00, vi20, vf02, 1); + vo00 = vfmaq_laneq_f32(vo00, vi21, vf02, 2); + vo00 = vfmaq_laneq_f32(vo00, vi22, vf02, 3); + + // outch 0, height 1 + vo01 = vfmaq_laneq_f32(vo01, vi10, vf00, 0); + vo01 = vfmaq_laneq_f32(vo01, vi11, vf00, 1); + vo01 = vfmaq_laneq_f32(vo01, vi12, vf00, 2); + vo01 = vfmaq_laneq_f32(vo01, vi20, vf01, 0); + vo01 = vfmaq_laneq_f32(vo01, vi21, vf01, 1); + vo01 = vfmaq_laneq_f32(vo01, vi22, vf01, 2); + vo01 = vfmaq_laneq_f32(vo01, vi30, vf02, 1); + vo01 = vfmaq_laneq_f32(vo01, vi31, vf02, 2); + vo01 = vfmaq_laneq_f32(vo01, vi32, vf02, 3); + + vst1q_f32(out_ptr0, vo00); + vst1q_f32(out_ptr0 + out_width, vo01); + + in_ptr0 += 4; + in_ptr1 += 4; + in_ptr2 += 4; + in_ptr3 += 4; + + out_ptr0 += 4; + } // w + + in_ptr0 += 2 + in_width; + in_ptr1 += 2 + in_width; + in_ptr2 += 2 + in_width; + in_ptr3 += 2 + in_width; + + out_ptr0 += out_width; + } // h +#else // arm v7 + float *out_ptr0 = out_ptr0_base; + + // load filter (1 outch x 3 height x 3 width): vf_outch_height + float32x2_t vf01, vf23, vf45, vf67, vf78; + vf01 = vld1_f32(filter_ptr0); + vf23 = vld1_f32(filter_ptr0 + 2); + vf45 = vld1_f32(filter_ptr0 + 4); + vf67 = vld1_f32(filter_ptr0 + 6); + vf78 = vld1_f32(filter_ptr0 + 7); + + for (index_t h = 0; h + 1 < out_height; h += 2) { + for (index_t w = 0; w + 3 < out_width; w += 4) { + // input (4 height x 3 slide): vi_height_slide + float32x4_t vi00, vi01, vi02, vi0n; + float32x4_t vi10, vi11, vi12, vi1n; + float32x4_t vi20, vi21, vi22, vi2n; + float32x4_t vi30, vi31, vi32, vi3n; + + // output (1 outch x 2 height x 4 width): vo_outch_height + float32x4_t vo00, vo01; + + // load input + vi00 = vld1q_f32(in_ptr0); + vi0n = vld1q_f32(in_ptr0 + 4); + vi10 = vld1q_f32(in_ptr1); + vi1n = vld1q_f32(in_ptr1 + 4); + vi20 = vld1q_f32(in_ptr2); + vi2n = vld1q_f32(in_ptr2 + 4); + vi30 = vld1q_f32(in_ptr3); + vi3n = vld1q_f32(in_ptr3 + 4); + + vi01 = vextq_f32(vi00, vi0n, 1); + vi02 = vextq_f32(vi00, vi0n, 2); + vi11 = vextq_f32(vi10, vi1n, 1); + vi12 = vextq_f32(vi10, vi1n, 2); + vi21 = vextq_f32(vi20, vi2n, 1); + vi22 = vextq_f32(vi20, vi2n, 2); + vi31 = vextq_f32(vi30, vi3n, 1); + vi32 = vextq_f32(vi30, vi3n, 2); + + // load ouptut + vo00 = vld1q_f32(out_ptr0); + vo01 = vld1q_f32(out_ptr0 + out_width); + + // outch 0, height 0 + vo00 = vmlaq_lane_f32(vo00, vi00, vf01, 0); + vo00 = vmlaq_lane_f32(vo00, vi01, vf01, 1); + vo00 = vmlaq_lane_f32(vo00, vi02, vf23, 0); + vo00 = vmlaq_lane_f32(vo00, vi10, vf23, 1); + vo00 = vmlaq_lane_f32(vo00, vi11, vf45, 0); + vo00 = vmlaq_lane_f32(vo00, vi12, vf45, 1); + vo00 = vmlaq_lane_f32(vo00, vi20, vf67, 0); + vo00 = vmlaq_lane_f32(vo00, vi21, vf67, 1); + vo00 = vmlaq_lane_f32(vo00, vi22, vf78, 1); + + // outch 0, height 1 + vo01 = vmlaq_lane_f32(vo01, vi10, vf01, 0); + vo01 = vmlaq_lane_f32(vo01, vi11, vf01, 1); + vo01 = vmlaq_lane_f32(vo01, vi12, vf23, 0); + vo01 = vmlaq_lane_f32(vo01, vi20, vf23, 1); + vo01 = vmlaq_lane_f32(vo01, vi21, vf45, 0); + vo01 = vmlaq_lane_f32(vo01, vi22, vf45, 1); + vo01 = vmlaq_lane_f32(vo01, vi30, vf67, 0); + vo01 = vmlaq_lane_f32(vo01, vi31, vf67, 1); + vo01 = vmlaq_lane_f32(vo01, vi32, vf78, 1); + + vst1q_f32(out_ptr0, vo00); + vst1q_f32(out_ptr0 + out_width, vo01); + + in_ptr0 += 4; + 
in_ptr1 += 4; + in_ptr2 += 4; + in_ptr3 += 4; + + out_ptr0 += 4; + } // w + + in_ptr0 += 2 + in_width; + in_ptr1 += 2 + in_width; + in_ptr2 += 2 + in_width; + in_ptr3 += 2 + in_width; + + out_ptr0 += out_width; + } // h +#endif + } // c + } // mm + } // if + } // m + } // b + }, 0, batch, 1, 0, out_channels, 2); UnPadOutput(*out_tensor, output); return MaceStatus::MACE_SUCCESS; @@ -544,11 +557,11 @@ MaceStatus Conv2dK3x3S2::Compute(const OpContext *context, &padded_input, &padded_output); const Tensor *in_tensor = input; - if (padded_input.get() != nullptr) { + if (padded_input != nullptr) { in_tensor = padded_input.get(); } Tensor *out_tensor = output; - if (padded_output.get() != nullptr) { + if (padded_output != nullptr) { out_tensor = padded_output.get(); } out_tensor->Clear(); @@ -560,153 +573,163 @@ MaceStatus Conv2dK3x3S2::Compute(const OpContext *context, auto input_data = in_tensor->data(); auto output_data = out_tensor->mutable_data(); - auto in_shape = in_tensor->shape(); - auto out_shape = out_tensor->shape(); - - const index_t in_image_size = in_shape[2] * in_shape[3]; - const index_t out_image_size = out_shape[2] * out_shape[3]; - const index_t in_batch_size = in_shape[1] * in_image_size; - const index_t out_batch_size = out_shape[1] * out_image_size; - -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t m = 0; m < out_shape[1]; ++m) { - for (index_t c = 0; c < in_shape[1]; ++c) { - const index_t in_channels = in_shape[1]; - const index_t in_width = in_shape[3]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - const float - *in_base = input_data + b * in_batch_size + c * in_image_size; - const float *filter_ptr = filter_data + m * in_channels * 9 + c * 9; - float *out_base = output_data + b * out_batch_size + m * out_image_size; + auto &in_shape = in_tensor->shape(); + auto &out_shape = out_tensor->shape(); + + const index_t batch = in_shape[0]; + const index_t in_channels = in_shape[1]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + const index_t out_channels = out_shape[1]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + + const index_t in_image_size = in_height * in_width; + const index_t out_image_size = out_height * out_width; + const index_t in_batch_size = in_channels * in_image_size; + const index_t out_batch_size = out_channels * out_image_size; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t m = start1; m < end1; m += step1) { + for (index_t c = 0; c < in_channels; ++c) { + const float + *in_base = input_data + b * in_batch_size + c * in_image_size; + const float *filter_ptr = filter_data + m * in_channels * 9 + c * 9; + float + *out_base = output_data + b * out_batch_size + m * out_image_size; #if defined(__aarch64__) - // load filter (1 outch x 3 height x 3 width): vf_outch_height - float32x4_t vf00, vf01, vf02; - vf00 = vld1q_f32(filter_ptr); - vf01 = vld1q_f32(filter_ptr + 3); - vf02 = vld1q_f32(filter_ptr + 5); - - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { - float32x4x2_t vi0, vi1, vi2; - float32x4_t vi0n, vi1n, vi2n; - - // input (3 height x 3 slide): vi_height_slide - float32x4_t vi00, vi01, vi02; - 
float32x4_t vi10, vi11, vi12; - float32x4_t vi20, vi21, vi22; - - // output (1 outch x 1 height x 4 width): vo - float32x4_t vo; - - // load input - index_t in_h = h * 2; - index_t in_w = w * 2; - index_t in_offset = in_h * in_width + in_w; - vi0 = vld2q_f32(in_base + in_offset); // [0.2.4.6, 1.3.5.7] - vi1 = vld2q_f32(in_base + in_offset + in_width); - vi2 = vld2q_f32(in_base + in_offset + 2 * in_width); - - vi0n = vld1q_f32(in_base + in_offset + 8); // [8.9.10.11] - vi1n = vld1q_f32(in_base + in_offset + in_width + 8); - vi2n = vld1q_f32(in_base + in_offset + 2 * in_width + 8); - - // load ouptut - index_t out_offset = h * out_width + w; - vo = vld1q_f32(out_base + out_offset); - - vi00 = vi0.val[0]; // [0.2.4.6] - vi01 = vi0.val[1]; // [1.3.5.7] - vi02 = vextq_f32(vi00, vi0n, 1); // [2.4.6.8] - vi10 = vi1.val[0]; - vi11 = vi1.val[1]; - vi12 = vextq_f32(vi10, vi1n, 1); - vi20 = vi2.val[0]; - vi21 = vi2.val[1]; - vi22 = vextq_f32(vi20, vi2n, 1); - - // outch 0, height 0 - vo = vfmaq_laneq_f32(vo, vi00, vf00, 0); - vo = vfmaq_laneq_f32(vo, vi01, vf00, 1); - vo = vfmaq_laneq_f32(vo, vi02, vf00, 2); - vo = vfmaq_laneq_f32(vo, vi10, vf01, 0); - vo = vfmaq_laneq_f32(vo, vi11, vf01, 1); - vo = vfmaq_laneq_f32(vo, vi12, vf01, 2); - vo = vfmaq_laneq_f32(vo, vi20, vf02, 1); - vo = vfmaq_laneq_f32(vo, vi21, vf02, 2); - vo = vfmaq_laneq_f32(vo, vi22, vf02, 3); - - vst1q_f32(out_base + out_offset, vo); - } // w - } // h + // load filter (1 outch x 3 height x 3 width): vf_outch_height + float32x4_t vf00, vf01, vf02; + vf00 = vld1q_f32(filter_ptr); + vf01 = vld1q_f32(filter_ptr + 3); + vf02 = vld1q_f32(filter_ptr + 5); + + for (index_t h = 0; h < out_height; ++h) { + for (index_t w = 0; w + 3 < out_width; w += 4) { + float32x4x2_t vi0, vi1, vi2; + float32x4_t vi0n, vi1n, vi2n; + + // input (3 height x 3 slide): vi_height_slide + float32x4_t vi00, vi01, vi02; + float32x4_t vi10, vi11, vi12; + float32x4_t vi20, vi21, vi22; + + // output (1 outch x 1 height x 4 width): vo + float32x4_t vo; + + // load input + index_t in_h = h * 2; + index_t in_w = w * 2; + index_t in_offset = in_h * in_width + in_w; + vi0 = vld2q_f32(in_base + in_offset); // [0.2.4.6, 1.3.5.7] + vi1 = vld2q_f32(in_base + in_offset + in_width); + vi2 = vld2q_f32(in_base + in_offset + 2 * in_width); + + vi0n = vld1q_f32(in_base + in_offset + 8); // [8.9.10.11] + vi1n = vld1q_f32(in_base + in_offset + in_width + 8); + vi2n = vld1q_f32(in_base + in_offset + 2 * in_width + 8); + + // load ouptut + index_t out_offset = h * out_width + w; + vo = vld1q_f32(out_base + out_offset); + + vi00 = vi0.val[0]; // [0.2.4.6] + vi01 = vi0.val[1]; // [1.3.5.7] + vi02 = vextq_f32(vi00, vi0n, 1); // [2.4.6.8] + vi10 = vi1.val[0]; + vi11 = vi1.val[1]; + vi12 = vextq_f32(vi10, vi1n, 1); + vi20 = vi2.val[0]; + vi21 = vi2.val[1]; + vi22 = vextq_f32(vi20, vi2n, 1); + + // outch 0, height 0 + vo = vfmaq_laneq_f32(vo, vi00, vf00, 0); + vo = vfmaq_laneq_f32(vo, vi01, vf00, 1); + vo = vfmaq_laneq_f32(vo, vi02, vf00, 2); + vo = vfmaq_laneq_f32(vo, vi10, vf01, 0); + vo = vfmaq_laneq_f32(vo, vi11, vf01, 1); + vo = vfmaq_laneq_f32(vo, vi12, vf01, 2); + vo = vfmaq_laneq_f32(vo, vi20, vf02, 1); + vo = vfmaq_laneq_f32(vo, vi21, vf02, 2); + vo = vfmaq_laneq_f32(vo, vi22, vf02, 3); + + vst1q_f32(out_base + out_offset, vo); + } // w + } // h #else // arm v7 - // load filter (1 outch x 3 height x 3 width): vf_outch_height - float32x2_t vf01, vf23, vf45, vf67, vf78; - vf01 = vld1_f32(filter_ptr); - vf23 = vld1_f32(filter_ptr + 2); - vf45 = vld1_f32(filter_ptr + 4); - vf67 = 
vld1_f32(filter_ptr + 6); - vf78 = vld1_f32(filter_ptr + 7); - - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { - float32x4x2_t vi0, vi1, vi2; - float32x4_t vi0n, vi1n, vi2n; - - // input (3 height x 3 slide): vi_height_slide - float32x4_t vi00, vi01, vi02; - float32x4_t vi10, vi11, vi12; - float32x4_t vi20, vi21, vi22; - - // output (1 outch x 1 height x 4 width): vo - float32x4_t vo; - - // load input - index_t in_h = h * 2; - index_t in_w = w * 2; - index_t in_offset = in_h * in_width + in_w; - vi0 = vld2q_f32(in_base + in_offset); // [0.2.4.6, 1.3.5.7] - vi1 = vld2q_f32(in_base + in_offset + in_width); - vi2 = vld2q_f32(in_base + in_offset + 2 * in_width); - - vi0n = vld1q_f32(in_base + in_offset + 8); // [8.9.10.11] - vi1n = vld1q_f32(in_base + in_offset + in_width + 8); - vi2n = vld1q_f32(in_base + in_offset + 2 * in_width + 8); - - // load ouptut - index_t out_offset = h * out_width + w; - vo = vld1q_f32(out_base + out_offset); - - vi00 = vi0.val[0]; // [0.2.4.6] - vi01 = vi0.val[1]; // [1.3.5.7] - vi02 = vextq_f32(vi00, vi0n, 1); // [2.4.6.8] - vi10 = vi1.val[0]; - vi11 = vi1.val[1]; - vi12 = vextq_f32(vi10, vi1n, 1); - vi20 = vi2.val[0]; - vi21 = vi2.val[1]; - vi22 = vextq_f32(vi20, vi2n, 1); - - // outch 0, height 0 - vo = vmlaq_lane_f32(vo, vi00, vf01, 0); - vo = vmlaq_lane_f32(vo, vi01, vf01, 1); - vo = vmlaq_lane_f32(vo, vi02, vf23, 0); - vo = vmlaq_lane_f32(vo, vi10, vf23, 1); - vo = vmlaq_lane_f32(vo, vi11, vf45, 0); - vo = vmlaq_lane_f32(vo, vi12, vf45, 1); - vo = vmlaq_lane_f32(vo, vi20, vf67, 0); - vo = vmlaq_lane_f32(vo, vi21, vf67, 1); - vo = vmlaq_lane_f32(vo, vi22, vf78, 1); - - vst1q_f32(out_base + out_offset, vo); - } // w - } // h + // load filter (1 outch x 3 height x 3 width): vf_outch_height + float32x2_t vf01, vf23, vf45, vf67, vf78; + vf01 = vld1_f32(filter_ptr); + vf23 = vld1_f32(filter_ptr + 2); + vf45 = vld1_f32(filter_ptr + 4); + vf67 = vld1_f32(filter_ptr + 6); + vf78 = vld1_f32(filter_ptr + 7); + + for (index_t h = 0; h < out_height; ++h) { + for (index_t w = 0; w + 3 < out_width; w += 4) { + float32x4x2_t vi0, vi1, vi2; + float32x4_t vi0n, vi1n, vi2n; + + // input (3 height x 3 slide): vi_height_slide + float32x4_t vi00, vi01, vi02; + float32x4_t vi10, vi11, vi12; + float32x4_t vi20, vi21, vi22; + + // output (1 outch x 1 height x 4 width): vo + float32x4_t vo; + + // load input + index_t in_h = h * 2; + index_t in_w = w * 2; + index_t in_offset = in_h * in_width + in_w; + vi0 = vld2q_f32(in_base + in_offset); // [0.2.4.6, 1.3.5.7] + vi1 = vld2q_f32(in_base + in_offset + in_width); + vi2 = vld2q_f32(in_base + in_offset + 2 * in_width); + + vi0n = vld1q_f32(in_base + in_offset + 8); // [8.9.10.11] + vi1n = vld1q_f32(in_base + in_offset + in_width + 8); + vi2n = vld1q_f32(in_base + in_offset + 2 * in_width + 8); + + // load ouptut + index_t out_offset = h * out_width + w; + vo = vld1q_f32(out_base + out_offset); + + vi00 = vi0.val[0]; // [0.2.4.6] + vi01 = vi0.val[1]; // [1.3.5.7] + vi02 = vextq_f32(vi00, vi0n, 1); // [2.4.6.8] + vi10 = vi1.val[0]; + vi11 = vi1.val[1]; + vi12 = vextq_f32(vi10, vi1n, 1); + vi20 = vi2.val[0]; + vi21 = vi2.val[1]; + vi22 = vextq_f32(vi20, vi2n, 1); + + // outch 0, height 0 + vo = vmlaq_lane_f32(vo, vi00, vf01, 0); + vo = vmlaq_lane_f32(vo, vi01, vf01, 1); + vo = vmlaq_lane_f32(vo, vi02, vf23, 0); + vo = vmlaq_lane_f32(vo, vi10, vf23, 1); + vo = vmlaq_lane_f32(vo, vi11, vf45, 0); + vo = vmlaq_lane_f32(vo, vi12, vf45, 1); + vo = vmlaq_lane_f32(vo, vi20, vf67, 0); + vo = 
vmlaq_lane_f32(vo, vi21, vf67, 1); + vo = vmlaq_lane_f32(vo, vi22, vf78, 1); + + vst1q_f32(out_base + out_offset, vo); + } // w + } // h #endif - } // c - } // m - } // b + } // c + } // m + } // b + }, 0, batch, 1, 0, out_channels, 1); UnPadOutput(*out_tensor, output); return MaceStatus::MACE_SUCCESS; diff --git a/mace/ops/arm/fp32/conv_2d_3x3.h b/mace/ops/arm/fp32/conv_2d_3x3.h index 66d47801c39fee076ca0fd0bddff806a8e30c127..bd96501d98f32ebe9ffe0bad98cccee67bc0b062 100644 --- a/mace/ops/arm/fp32/conv_2d_3x3.h +++ b/mace/ops/arm/fp32/conv_2d_3x3.h @@ -28,7 +28,7 @@ namespace fp32 { class Conv2dK3x3S1 : public Conv2dBase { public: - Conv2dK3x3S1(const std::vector paddings, const Padding padding_type) + Conv2dK3x3S1(const std::vector &paddings, const Padding padding_type) : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {} virtual ~Conv2dK3x3S1() {} @@ -36,12 +36,12 @@ class Conv2dK3x3S1 : public Conv2dBase { const OpContext *context, const Tensor *input, const Tensor *filter, - Tensor *output); + Tensor *output) override; }; class Conv2dK3x3S2 : public Conv2dBase { public: - Conv2dK3x3S2(const std::vector paddings, const Padding padding_type) + Conv2dK3x3S2(const std::vector &paddings, const Padding padding_type) : Conv2dBase({2, 2}, {1, 1}, paddings, padding_type) {} virtual ~Conv2dK3x3S2() {} @@ -49,7 +49,7 @@ class Conv2dK3x3S2 : public Conv2dBase { const OpContext *context, const Tensor *input, const Tensor *filter, - Tensor *output); + Tensor *output) override; }; } // namespace fp32 diff --git a/mace/ops/arm/fp32/conv_2d_3x3_winograd.cc b/mace/ops/arm/fp32/conv_2d_3x3_winograd.cc index b894a60a964ff9b149abc5d93852f76a658b9b94..ab2517bf6295691de4ba00fd22d9e651e1e13fee 100644 --- a/mace/ops/arm/fp32/conv_2d_3x3_winograd.cc +++ b/mace/ops/arm/fp32/conv_2d_3x3_winograd.cc @@ -12,9 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
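(Editorial sketch, not part of the patch.) The hunks above and below replace "#pragma omp parallel for collapse(2)" loops with utils::ThreadPool::Compute2D, and the calling convention can be read off the call sites: a functor taking (start, end, step) for each of two dimensions, followed by the two iteration ranges and their steps, where the second step matches the kernel's output-channel blocking (1, 2 or 4). The stand-in below only illustrates that convention under those assumptions; ToyThreadPool and Example are hypothetical names, and a real pool would split the ranges across worker threads instead of running the functor once serially.

#include <cstdint>
#include <functional>

using index_t = int64_t;

struct ToyThreadPool {
  // Serial stand-in: a real thread pool would partition [start0, end0) and
  // [start1, end1) into per-thread sub-ranges and call the functor once per
  // sub-range; here it is invoked a single time with the full ranges.
  void Compute2D(const std::function<void(index_t, index_t, index_t,
                                          index_t, index_t, index_t)> &func,
                 index_t start0, index_t end0, index_t step0,
                 index_t start1, index_t end1, index_t step1) {
    func(start0, end0, step0, start1, end1, step1);
  }
};

void Example(ToyThreadPool *pool, index_t batch, index_t out_channels) {
  pool->Compute2D([=](index_t start0, index_t end0, index_t step0,
                      index_t start1, index_t end1, index_t step1) {
    for (index_t b = start0; b < end0; b += step0) {
      for (index_t m = start1; m < end1; m += step1) {
        // per-(batch, output-channel-block) kernel body, i.e. the loops that
        // previously sat under the OpenMP pragma
      }
    }
  }, 0, batch, 1, 0, out_channels, 4);  // step1 == the kernel's channel block
}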
+#include "mace/ops/arm/fp32/conv_2d_3x3_winograd.h" + #include -#include "mace/ops/arm/fp32/conv_2d_3x3_winograd.h" #include "mace/ops/common/conv_pool_2d_util.h" #include "mace/utils/memory.h" #include "mace/utils/math.h" @@ -136,13 +137,15 @@ MaceStatus Conv2dK3x3Winograd::Compute(const OpContext *context, auto transformed_filter_data = transformed_filter_->mutable_data(); switch (out_tile_size) { case 2: - TransformFilter4x4(filter_data, + TransformFilter4x4(context, + filter_data, in_channels, out_channels, transformed_filter_data); break; case 6: - TransformFilter8x8(filter_data, + TransformFilter8x8(context, + filter_data, in_channels, out_channels, transformed_filter_data); @@ -153,7 +156,8 @@ MaceStatus Conv2dK3x3Winograd::Compute(const OpContext *context, switch (out_tile_size) { case 2: - TransformInput4x4(padded_in_data, + TransformInput4x4(context, + padded_in_data, batch, padded_in_height, padded_in_width, @@ -162,7 +166,8 @@ MaceStatus Conv2dK3x3Winograd::Compute(const OpContext *context, transformed_in_data); break; case 6: - TransformInput8x8(padded_in_data, + TransformInput8x8(context, + padded_in_data, batch, padded_in_height, padded_in_width, @@ -212,7 +217,8 @@ MaceStatus Conv2dK3x3Winograd::Compute(const OpContext *context, switch (out_tile_size) { case 2: - TransformOutput4x4(transformed_out_data, + TransformOutput4x4(context, + transformed_out_data, batch, padded_out_height, padded_out_width, @@ -221,7 +227,8 @@ MaceStatus Conv2dK3x3Winograd::Compute(const OpContext *context, padded_out_data); break; case 6: - TransformOutput8x8(transformed_out_data, + TransformOutput8x8(context, + transformed_out_data, batch, padded_out_height, padded_out_width, @@ -238,72 +245,78 @@ MaceStatus Conv2dK3x3Winograd::Compute(const OpContext *context, } // OCHW => TOC -void Conv2dK3x3Winograd::TransformFilter4x4(const float *filter, +void Conv2dK3x3Winograd::TransformFilter4x4(const OpContext *context, + const float *filter, const index_t in_channels, const index_t out_channels, float *output) { const index_t stride = out_channels * in_channels; -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t m = 0; m < out_channels; ++m) { - for (index_t c = 0; c < in_channels; ++c) { - float g0, g1, g2, g3, g4, g5, g6, g7, g8; - float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, - s15; - - // load filter - index_t filter_offset = (m * in_channels + c) * 9; - g0 = filter[filter_offset]; - g1 = filter[filter_offset + 1]; - g2 = filter[filter_offset + 2]; - g3 = filter[filter_offset + 3]; - g4 = filter[filter_offset + 4]; - g5 = filter[filter_offset + 5]; - g6 = filter[filter_offset + 6]; - g7 = filter[filter_offset + 7]; - g8 = filter[filter_offset + 8]; - - // s = G * g * GT - s0 = g0; - s1 = (g0 + g2 + g1) * 0.5f; - s2 = (g0 + g2 - g1) * 0.5f; - s3 = g2; - s4 = (g0 + g6 + g3) * 0.5f; - s5 = ((g0 + g6 + g3) + (g2 + g8 + g5) + (g1 + g7 + g4)) * 0.25f; - s6 = ((g0 + g6 + g3) + (g2 + g8 + g5) - (g1 + g7 + g4)) * 0.25f; - s7 = (g2 + g8 + g5) * 0.5f; - s8 = (g0 + g6 - g3) * 0.5f; - s9 = ((g0 + g6 - g3) + (g2 + g8 - g5) + (g1 + g7 - g4)) * 0.25f; - s10 = ((g0 + g6 - g3) + (g2 + g8 - g5) - (g1 + g7 - g4)) * 0.25f; - s11 = (g2 + g8 - g5) * 0.5f; - s12 = g6; - s13 = (g6 + g8 + g7) * 0.5f; - s14 = (g6 + g8 - g7) * 0.5f; - s15 = g8; - - // store output - index_t output_offset = m * in_channels + c; - output[output_offset + 0 * stride] = s0; - output[output_offset + 1 * stride] = s1; - output[output_offset + 2 * stride] = s2; - output[output_offset + 3 * stride] = s3; - 
- output[output_offset + 4 * stride] = s4; - output[output_offset + 5 * stride] = s5; - output[output_offset + 6 * stride] = s6; - output[output_offset + 7 * stride] = s7; - - output[output_offset + 8 * stride] = s8; - output[output_offset + 9 * stride] = s9; - output[output_offset + 10 * stride] = s10; - output[output_offset + 11 * stride] = s11; - - output[output_offset + 12 * stride] = s12; - output[output_offset + 13 * stride] = s13; - output[output_offset + 14 * stride] = s14; - output[output_offset + 15 * stride] = s15; + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t m = start0; m < end0; m += step0) { + for (index_t c = start1; c < end1; c += step1) { + float g0, g1, g2, g3, g4, g5, g6, g7, g8; + float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, + s15; + + // load filter + index_t filter_offset = (m * in_channels + c) * 9; + g0 = filter[filter_offset]; + g1 = filter[filter_offset + 1]; + g2 = filter[filter_offset + 2]; + g3 = filter[filter_offset + 3]; + g4 = filter[filter_offset + 4]; + g5 = filter[filter_offset + 5]; + g6 = filter[filter_offset + 6]; + g7 = filter[filter_offset + 7]; + g8 = filter[filter_offset + 8]; + + // s = G * g * GT + s0 = g0; + s1 = (g0 + g2 + g1) * 0.5f; + s2 = (g0 + g2 - g1) * 0.5f; + s3 = g2; + s4 = (g0 + g6 + g3) * 0.5f; + s5 = ((g0 + g6 + g3) + (g2 + g8 + g5) + (g1 + g7 + g4)) * 0.25f; + s6 = ((g0 + g6 + g3) + (g2 + g8 + g5) - (g1 + g7 + g4)) * 0.25f; + s7 = (g2 + g8 + g5) * 0.5f; + s8 = (g0 + g6 - g3) * 0.5f; + s9 = ((g0 + g6 - g3) + (g2 + g8 - g5) + (g1 + g7 - g4)) * 0.25f; + s10 = ((g0 + g6 - g3) + (g2 + g8 - g5) - (g1 + g7 - g4)) * 0.25f; + s11 = (g2 + g8 - g5) * 0.5f; + s12 = g6; + s13 = (g6 + g8 + g7) * 0.5f; + s14 = (g6 + g8 - g7) * 0.5f; + s15 = g8; + + // store output + index_t output_offset = m * in_channels + c; + output[output_offset + 0 * stride] = s0; + output[output_offset + 1 * stride] = s1; + output[output_offset + 2 * stride] = s2; + output[output_offset + 3 * stride] = s3; + + output[output_offset + 4 * stride] = s4; + output[output_offset + 5 * stride] = s5; + output[output_offset + 6 * stride] = s6; + output[output_offset + 7 * stride] = s7; + + output[output_offset + 8 * stride] = s8; + output[output_offset + 9 * stride] = s9; + output[output_offset + 10 * stride] = s10; + output[output_offset + 11 * stride] = s11; + + output[output_offset + 12 * stride] = s12; + output[output_offset + 13 * stride] = s13; + output[output_offset + 14 * stride] = s14; + output[output_offset + 15 * stride] = s15; + } } - } + }, 0, out_channels, 1, 0, in_channels, 1); } // OCHW => TOC @@ -325,7 +338,8 @@ void Conv2dK3x3Winograd::TransformFilter4x4(const float *filter, ⎢ ⎥ ⎣ 0 0 1 ⎦ */ -void Conv2dK3x3Winograd::TransformFilter8x8(const float *filter, +void Conv2dK3x3Winograd::TransformFilter8x8(const OpContext *context, + const float *filter, const index_t in_channels, const index_t out_channels, float *output) { @@ -340,43 +354,49 @@ void Conv2dK3x3Winograd::TransformFilter8x8(const float *filter, {1.0f / 45, -1.0f / 90, 1.0f / 180}, {0.0f, 0.0f, 1.0f}}; -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t m = 0; m < out_channels; ++m) { - for (index_t c = 0; c < in_channels; ++c) { - // load filter - index_t filter_offset = (m * in_channels + c) * 9; - float g0, g1, g2, g3, g4, g5, g6, g7, g8; - g0 = filter[filter_offset]; - g1 = 
filter[filter_offset + 1]; - g2 = filter[filter_offset + 2]; - g3 = filter[filter_offset + 3]; - g4 = filter[filter_offset + 4]; - g5 = filter[filter_offset + 5]; - g6 = filter[filter_offset + 6]; - g7 = filter[filter_offset + 7]; - g8 = filter[filter_offset + 8]; - - float s[3][8]; - for (int i = 0; i < 8; ++i) { - s[0][i] = g0 * G[i][0] + g1 * G[i][1] + g2 * G[i][2]; - s[1][i] = g3 * G[i][0] + g4 * G[i][1] + g5 * G[i][2]; - s[2][i] = g6 * G[i][0] + g7 * G[i][1] + g8 * G[i][2]; - } + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t m = start0; m < end0; m += step0) { + for (index_t c = start1; c < end1; c += step1) { + // load filter + index_t filter_offset = (m * in_channels + c) * 9; + float g0, g1, g2, g3, g4, g5, g6, g7, g8; + g0 = filter[filter_offset]; + g1 = filter[filter_offset + 1]; + g2 = filter[filter_offset + 2]; + g3 = filter[filter_offset + 3]; + g4 = filter[filter_offset + 4]; + g5 = filter[filter_offset + 5]; + g6 = filter[filter_offset + 6]; + g7 = filter[filter_offset + 7]; + g8 = filter[filter_offset + 8]; + + float s[3][8]; + for (int i = 0; i < 8; ++i) { + s[0][i] = g0 * G[i][0] + g1 * G[i][1] + g2 * G[i][2]; + s[1][i] = g3 * G[i][0] + g4 * G[i][1] + g5 * G[i][2]; + s[2][i] = g6 * G[i][0] + g7 * G[i][1] + g8 * G[i][2]; + } - // store output - index_t output_offset = m * in_channels + c; - for (int i = 0; i < 8; ++i) { - for (int j = 0; j < 8; ++j) { - output[output_offset + (i * 8 + j) * stride] = - G[i][0] * s[0][j] + G[i][1] * s[1][j] + G[i][2] * s[2][j]; + // store output + index_t output_offset = m * in_channels + c; + for (int i = 0; i < 8; ++i) { + for (int j = 0; j < 8; ++j) { + output[output_offset + (i * 8 + j) * stride] = + G[i][0] * s[0][j] + G[i][1] * s[1][j] + G[i][2] * s[2][j]; + } } } } - } + }, 0, out_channels, 1, 0, in_channels, 1); } // NCHW => NTCB (T: in tile pixels, B: tile indices) -void Conv2dK3x3Winograd::TransformInput4x4(const float *input, +void Conv2dK3x3Winograd::TransformInput4x4(const OpContext *context, + const float *input, const index_t batch, const index_t in_height, const index_t in_width, @@ -388,86 +408,93 @@ void Conv2dK3x3Winograd::TransformInput4x4(const float *input, const index_t input_batch_size = in_height_width * in_channels; const index_t output_batch_size = 16 * in_channels * tile_count; -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t n = 0; n < batch; ++n) { - for (index_t c = 0; c < in_channels; ++c) { - index_t tile_index = 0; - for (index_t h = 0; h < in_height - 2; h += 2) { - for (index_t w = 0; w < in_width - 2; w += 2) { - float d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, - d15; - float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, - s15; - - // load tile data - const float *input_ptr = input + n * input_batch_size + - c * in_height_width + h * in_width + w; - d0 = input_ptr[0]; - d1 = input_ptr[1]; - d2 = input_ptr[2]; - d3 = input_ptr[3]; - - d4 = input_ptr[in_width]; - d5 = input_ptr[in_width + 1]; - d6 = input_ptr[in_width + 2]; - d7 = input_ptr[in_width + 3]; - - d8 = input_ptr[2 * in_width]; - d9 = input_ptr[2 * in_width + 1]; - d10 = input_ptr[2 * in_width + 2]; - d11 = input_ptr[2 * in_width + 3]; - - d12 = input_ptr[3 * in_width]; - d13 = input_ptr[3 * in_width + 1]; - d14 = input_ptr[3 * in_width + 2]; - d15 = input_ptr[3 * in_width + 3]; - - // s = BT * d * B - s0 = (d0 
- d8) - (d2 - d10); - s1 = (d1 - d9) + (d2 - d10); - s2 = (d2 - d10) - (d1 - d9); - s3 = (d1 - d9) - (d3 - d11); - s4 = (d4 + d8) - (d6 + d10); - s5 = (d5 + d9) + (d6 + d10); - s6 = (d6 + d10) - (d5 + d9); - s7 = (d5 + d9) - (d7 + d11); - s8 = (d8 - d4) - (d10 - d6); - s9 = (d9 - d5) + (d10 - d6); - s10 = (d10 - d6) - (d9 - d5); - s11 = (d9 - d5) - (d11 - d7); - s12 = (d4 - d12) - (d6 - d14); - s13 = (d5 - d13) + (d6 - d14); - s14 = (d6 - d14) - (d5 - d13); - s15 = (d5 - d13) - (d7 - d15); - - // store output - float *output_ptr = - output + n * output_batch_size + c * tile_count + tile_index; - output_ptr[0] = s0; - output_ptr[1 * stride] = s1; - output_ptr[2 * stride] = s2; - output_ptr[3 * stride] = s3; - - output_ptr[4 * stride] = s4; - output_ptr[5 * stride] = s5; - output_ptr[6 * stride] = s6; - output_ptr[7 * stride] = s7; - - output_ptr[8 * stride] = s8; - output_ptr[9 * stride] = s9; - output_ptr[10 * stride] = s10; - output_ptr[11 * stride] = s11; - - output_ptr[12 * stride] = s12; - output_ptr[13 * stride] = s13; - output_ptr[14 * stride] = s14; - output_ptr[15 * stride] = s15; - - ++tile_index; + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t n = start0; n < end0; n += step0) { + for (index_t c = start1; c < end1; c += step1) { + index_t tile_index = 0; + for (index_t h = 0; h < in_height - 2; h += 2) { + for (index_t w = 0; w < in_width - 2; w += 2) { + float d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, + d14, + d15; + float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, + s14, + s15; + + // load tile data + const float *input_ptr = input + n * input_batch_size + + c * in_height_width + h * in_width + w; + d0 = input_ptr[0]; + d1 = input_ptr[1]; + d2 = input_ptr[2]; + d3 = input_ptr[3]; + + d4 = input_ptr[in_width]; + d5 = input_ptr[in_width + 1]; + d6 = input_ptr[in_width + 2]; + d7 = input_ptr[in_width + 3]; + + d8 = input_ptr[2 * in_width]; + d9 = input_ptr[2 * in_width + 1]; + d10 = input_ptr[2 * in_width + 2]; + d11 = input_ptr[2 * in_width + 3]; + + d12 = input_ptr[3 * in_width]; + d13 = input_ptr[3 * in_width + 1]; + d14 = input_ptr[3 * in_width + 2]; + d15 = input_ptr[3 * in_width + 3]; + + // s = BT * d * B + s0 = (d0 - d8) - (d2 - d10); + s1 = (d1 - d9) + (d2 - d10); + s2 = (d2 - d10) - (d1 - d9); + s3 = (d1 - d9) - (d3 - d11); + s4 = (d4 + d8) - (d6 + d10); + s5 = (d5 + d9) + (d6 + d10); + s6 = (d6 + d10) - (d5 + d9); + s7 = (d5 + d9) - (d7 + d11); + s8 = (d8 - d4) - (d10 - d6); + s9 = (d9 - d5) + (d10 - d6); + s10 = (d10 - d6) - (d9 - d5); + s11 = (d9 - d5) - (d11 - d7); + s12 = (d4 - d12) - (d6 - d14); + s13 = (d5 - d13) + (d6 - d14); + s14 = (d6 - d14) - (d5 - d13); + s15 = (d5 - d13) - (d7 - d15); + + // store output + float *output_ptr = + output + n * output_batch_size + c * tile_count + tile_index; + output_ptr[0] = s0; + output_ptr[1 * stride] = s1; + output_ptr[2 * stride] = s2; + output_ptr[3 * stride] = s3; + + output_ptr[4 * stride] = s4; + output_ptr[5 * stride] = s5; + output_ptr[6 * stride] = s6; + output_ptr[7 * stride] = s7; + + output_ptr[8 * stride] = s8; + output_ptr[9 * stride] = s9; + output_ptr[10 * stride] = s10; + output_ptr[11 * stride] = s11; + + output_ptr[12 * stride] = s12; + output_ptr[13 * stride] = s13; + output_ptr[14 * stride] = s14; + output_ptr[15 * stride] = s15; + + ++tile_index; + } } } } - } + }, 0, batch, 1, 0, in_channels, 
1); } // NCHW => NTCB (T: in tile pixels, B: tile indices) @@ -489,7 +516,8 @@ void Conv2dK3x3Winograd::TransformInput4x4(const float *input, ⎢ ⎥ ⎣0 -1 0 21/4 0 -21/4 0 1⎦ */ -void Conv2dK3x3Winograd::TransformInput8x8(const float *input, +void Conv2dK3x3Winograd::TransformInput8x8(const OpContext *context, + const float *input, const index_t batch, const index_t in_height, const index_t in_width, @@ -501,89 +529,94 @@ void Conv2dK3x3Winograd::TransformInput8x8(const float *input, const index_t input_batch_size = in_height_width * in_channels; const index_t output_batch_size = 64 * in_channels * tile_count; -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t n = 0; n < batch; ++n) { - for (index_t c = 0; c < in_channels; ++c) { - index_t tile_index = 0; - float s[8][8]; - for (index_t h = 0; h < in_height - 2; h += 6) { - for (index_t w = 0; w < in_width - 2; w += 6) { - const float *input_ptr = input + n * input_batch_size + - c * in_height_width + h * in_width + w; - - for (int i = 0; i < 8; ++i) { - float d0, d1, d2, d3, d4, d5, d6, d7; - d0 = input_ptr[0]; - d1 = input_ptr[1]; - d2 = input_ptr[2]; - d3 = input_ptr[3]; - d4 = input_ptr[4]; - d5 = input_ptr[5]; - d6 = input_ptr[6]; - d7 = input_ptr[7]; - - s[i][0] = d0 - d6 + (d4 - d2) * 5.25; - s[i][7] = d7 - d1 + (d3 - d5) * 5.25; - - float u = d2 + d6 - d4 * 4.25; - float v = d1 + d5 - d3 * 4.25; - s[i][1] = u + v; - s[i][2] = u - v; - - u = d6 + d2 * 0.25 - d4 * 1.25; - v = d1 * 0.5 - d3 * 2.5 + d5 * 2; - s[i][3] = u + v; - s[i][4] = u - v; - - u = d6 + (d2 - d4 * 1.25) * 4; - v = d1 * 2 - d3 * 2.5 + d5 * 0.5; - s[i][5] = u + v; - s[i][6] = u - v; - - input_ptr += in_width; - } - - float *output_ptr = - output + n * output_batch_size + c * tile_count + tile_index; - for (int i = 0; i < 8; ++i) { - float d0, d1, d2, d3, d4, d5, d6, d7; - d0 = s[0][i]; - d1 = s[1][i]; - d2 = s[2][i]; - d3 = s[3][i]; - d4 = s[4][i]; - d5 = s[5][i]; - d6 = s[6][i]; - d7 = s[7][i]; - - output_ptr[i * stride] = d0 - d6 + (d4 - d2) * 5.25; - output_ptr[(56 + i) * stride] = d7 - d1 + (d3 - d5) * 5.25; - - float u = d2 + d6 - d4 * 4.25; - float v = d1 + d5 - d3 * 4.25; - output_ptr[(8 + i) * stride] = u + v; - output_ptr[(16 + i) * stride] = u - v; - - u = d6 + d2 * 0.25 - d4 * 1.25; - v = d1 * 0.5 - d3 * 2.5 + d5 * 2; - output_ptr[(24 + i) * stride] = u + v; - output_ptr[(32 + i) * stride] = u - v; - - u = d6 + (d2 - d4 * 1.25) * 4; - v = d1 * 2 - d3 * 2.5 + d5 * 0.5; - output_ptr[(40 + i) * stride] = u + v; - output_ptr[(48 + i) * stride] = u - v; + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t n = start0; n < end0; n += step0) { + for (index_t c = start1; c < end1; c += step1) { + index_t tile_index = 0; + float s[8][8]; + for (index_t h = 0; h < in_height - 2; h += 6) { + for (index_t w = 0; w < in_width - 2; w += 6) { + const float *input_ptr = input + n * input_batch_size + + c * in_height_width + h * in_width + w; + + for (int i = 0; i < 8; ++i) { + float d0, d1, d2, d3, d4, d5, d6, d7; + d0 = input_ptr[0]; + d1 = input_ptr[1]; + d2 = input_ptr[2]; + d3 = input_ptr[3]; + d4 = input_ptr[4]; + d5 = input_ptr[5]; + d6 = input_ptr[6]; + d7 = input_ptr[7]; + + s[i][0] = d0 - d6 + (d4 - d2) * 5.25; + s[i][7] = d7 - d1 + (d3 - d5) * 5.25; + + float u = d2 + d6 - d4 * 4.25; + float v = d1 + d5 - d3 * 4.25; + s[i][1] = u + v; + s[i][2] = u - v; + + u = d6 + d2 * 0.25 - 
d4 * 1.25; + v = d1 * 0.5 - d3 * 2.5 + d5 * 2; + s[i][3] = u + v; + s[i][4] = u - v; + + u = d6 + (d2 - d4 * 1.25) * 4; + v = d1 * 2 - d3 * 2.5 + d5 * 0.5; + s[i][5] = u + v; + s[i][6] = u - v; + + input_ptr += in_width; + } + + float *output_ptr = + output + n * output_batch_size + c * tile_count + tile_index; + for (int i = 0; i < 8; ++i) { + float d0, d1, d2, d3, d4, d5, d6, d7; + d0 = s[0][i]; + d1 = s[1][i]; + d2 = s[2][i]; + d3 = s[3][i]; + d4 = s[4][i]; + d5 = s[5][i]; + d6 = s[6][i]; + d7 = s[7][i]; + + output_ptr[i * stride] = d0 - d6 + (d4 - d2) * 5.25; + output_ptr[(56 + i) * stride] = d7 - d1 + (d3 - d5) * 5.25; + + float u = d2 + d6 - d4 * 4.25; + float v = d1 + d5 - d3 * 4.25; + output_ptr[(8 + i) * stride] = u + v; + output_ptr[(16 + i) * stride] = u - v; + + u = d6 + d2 * 0.25 - d4 * 1.25; + v = d1 * 0.5 - d3 * 2.5 + d5 * 2; + output_ptr[(24 + i) * stride] = u + v; + output_ptr[(32 + i) * stride] = u - v; + + u = d6 + (d2 - d4 * 1.25) * 4; + v = d1 * 2 - d3 * 2.5 + d5 * 0.5; + output_ptr[(40 + i) * stride] = u + v; + output_ptr[(48 + i) * stride] = u - v; + } + + ++tile_index; } - - ++tile_index; } } } - } + }, 0, batch, 1, 0, in_channels, 1); } // NTOB => NToOB => NOHoWo -void Conv2dK3x3Winograd::TransformOutput4x4(const float *input, +void Conv2dK3x3Winograd::TransformOutput4x4(const OpContext *context, + const float *input, index_t batch, index_t out_height, index_t out_width, @@ -595,65 +628,70 @@ void Conv2dK3x3Winograd::TransformOutput4x4(const float *input, const index_t out_image_size = out_height * out_width; const index_t output_batch_size = out_channels * out_image_size; -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t n = 0; n < batch; ++n) { - for (index_t m = 0; m < out_channels; ++m) { - index_t tile_offset = 0; - for (index_t h = 0; h < out_height; h += 2) { - for (index_t w = 0; w < out_width; w += 2) { - float d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, - d15; - float s0, s1, s2, s3, s4, s5, s6, s7; - float v0, v1, v2, v3; - - const float *input_ptr = - input + n * input_batch_size + m * tile_count + tile_offset; - d0 = input_ptr[0]; - d1 = input_ptr[1 * stride]; - d2 = input_ptr[2 * stride]; - d3 = input_ptr[3 * stride]; - - d4 = input_ptr[4 * stride]; - d5 = input_ptr[5 * stride]; - d6 = input_ptr[6 * stride]; - d7 = input_ptr[7 * stride]; - - d8 = input_ptr[8 * stride]; - d9 = input_ptr[9 * stride]; - d10 = input_ptr[10 * stride]; - d11 = input_ptr[11 * stride]; - - d12 = input_ptr[12 * stride]; - d13 = input_ptr[13 * stride]; - d14 = input_ptr[14 * stride]; - d15 = input_ptr[15 * stride]; - - s0 = d0 + d1 + d2; - s1 = d1 - d2 - d3; - s2 = d4 + d5 + d6; - s3 = d5 - d6 - d7; - s4 = d8 + d9 + d10; - s5 = d9 - d10 - d11; - s6 = d12 + d13 + d14; - s7 = d13 - d14 - d15; - - v0 = s0 + s2 + s4; - v1 = s1 + s3 + s5; - v2 = s2 - s4 - s6; - v3 = s3 - s5 - s7; - - float *output_ptr = output + n * output_batch_size + - m * out_image_size + h * out_width + w; - output_ptr[0] = v0; - output_ptr[1] = v1; - output_ptr[out_width] = v2; - output_ptr[out_width + 1] = v3; - - ++tile_offset; + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t n = start0; n < end0; n += step0) { + for (index_t m = start1; m < end1; m += step1) { + index_t tile_offset = 0; + for (index_t h = 0; h < out_height; h += 2) { + for (index_t w = 0; w < out_width; w += 2) { + float d0, d1, 
d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, + d14, + d15; + float s0, s1, s2, s3, s4, s5, s6, s7; + float v0, v1, v2, v3; + + const float *input_ptr = + input + n * input_batch_size + m * tile_count + tile_offset; + d0 = input_ptr[0]; + d1 = input_ptr[1 * stride]; + d2 = input_ptr[2 * stride]; + d3 = input_ptr[3 * stride]; + + d4 = input_ptr[4 * stride]; + d5 = input_ptr[5 * stride]; + d6 = input_ptr[6 * stride]; + d7 = input_ptr[7 * stride]; + + d8 = input_ptr[8 * stride]; + d9 = input_ptr[9 * stride]; + d10 = input_ptr[10 * stride]; + d11 = input_ptr[11 * stride]; + + d12 = input_ptr[12 * stride]; + d13 = input_ptr[13 * stride]; + d14 = input_ptr[14 * stride]; + d15 = input_ptr[15 * stride]; + + s0 = d0 + d1 + d2; + s1 = d1 - d2 - d3; + s2 = d4 + d5 + d6; + s3 = d5 - d6 - d7; + s4 = d8 + d9 + d10; + s5 = d9 - d10 - d11; + s6 = d12 + d13 + d14; + s7 = d13 - d14 - d15; + + v0 = s0 + s2 + s4; + v1 = s1 + s3 + s5; + v2 = s2 - s4 - s6; + v3 = s3 - s5 - s7; + + float *output_ptr = output + n * output_batch_size + + m * out_image_size + h * out_width + w; + output_ptr[0] = v0; + output_ptr[1] = v1; + output_ptr[out_width] = v2; + output_ptr[out_width + 1] = v3; + + ++tile_offset; + } } } } - } + }, 0, batch, 1, 0, out_channels, 1); } // NTOB => NToOB => NOHoWo @@ -671,7 +709,8 @@ void Conv2dK3x3Winograd::TransformOutput4x4(const float *input, ⎢ ⎥ ⎣0 1 -1 32 -32 1 -1 1⎦ */ -void Conv2dK3x3Winograd::TransformOutput8x8(const float *input, +void Conv2dK3x3Winograd::TransformOutput8x8(const OpContext *context, + const float *input, index_t batch, index_t out_height, index_t out_width, @@ -683,78 +722,82 @@ void Conv2dK3x3Winograd::TransformOutput8x8(const float *input, const index_t out_image_size = out_height * out_width; const index_t output_batch_size = out_channels * out_image_size; -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t n = 0; n < batch; ++n) { - for (index_t m = 0; m < out_channels; ++m) { - index_t tile_offset = 0; - float s[8][6]; - for (index_t h = 0; h < out_height; h += 6) { - for (index_t w = 0; w < out_width; w += 6) { - const float *input_ptr = - input + n * input_batch_size + m * tile_count + tile_offset; - for (int i = 0; i < 8; ++i) { - float d0, d1, d2, d3, d4, d5, d6, d7; - - d0 = input_ptr[0]; - d1 = input_ptr[1 * stride]; - d2 = input_ptr[2 * stride]; - d3 = input_ptr[3 * stride]; - d4 = input_ptr[4 * stride]; - d5 = input_ptr[5 * stride]; - d6 = input_ptr[6 * stride]; - d7 = input_ptr[7 * stride]; - - float u = d1 + d2; - float v = d1 - d2; - float w = d3 + d4; - float x = d3 - d4; - float y = d5 + d6; - float z = d5 - d6; - - s[i][0] = d0 + u + w + y * 32; - s[i][1] = v + x + x + z * 16; - s[i][2] = u + w * 4 + y * 8; - s[i][3] = v + x * 8 + z * 4; - s[i][4] = u + w * 16 + y + y; - s[i][5] = v + x * 32 + z + d7; - - input_ptr += 8 * stride; - } - - float *output_ptr = output + n * output_batch_size + - m * out_image_size + h * out_width + w; - - for (int i = 0; i < 6; ++i) { - float d0, d1, d2, d3, d4, d5, d6, d7; - d0 = s[0][i]; - d1 = s[1][i]; - d2 = s[2][i]; - d3 = s[3][i]; - d4 = s[4][i]; - d5 = s[5][i]; - d6 = s[6][i]; - d7 = s[7][i]; - - float u = d1 + d2; - float v = d1 - d2; - float w = d3 + d4; - float x = d3 - d4; - float y = d5 + d6; - float z = d5 - d6; - - output_ptr[i] = d0 + u + w + y * 32; - output_ptr[1 * out_width + i] = v + x + x + z * 16; - output_ptr[2 * out_width + i] = u + w * 4 + y * 8; - output_ptr[3 * out_width + i] = v + x * 8 + z * 4; - output_ptr[4 * out_width + i] = u + w * 16 + y + y; - output_ptr[5 * 
out_width + i] = v + x * 32 + z + d7; + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t n = start0; n < end0; n += step0) { + for (index_t m = start1; m < end1; m += step1) { + index_t tile_offset = 0; + float s[8][6]; + for (index_t h = 0; h < out_height; h += 6) { + for (index_t w = 0; w < out_width; w += 6) { + const float *input_ptr = + input + n * input_batch_size + m * tile_count + tile_offset; + for (int i = 0; i < 8; ++i) { + float d0, d1, d2, d3, d4, d5, d6, d7; + + d0 = input_ptr[0]; + d1 = input_ptr[1 * stride]; + d2 = input_ptr[2 * stride]; + d3 = input_ptr[3 * stride]; + d4 = input_ptr[4 * stride]; + d5 = input_ptr[5 * stride]; + d6 = input_ptr[6 * stride]; + d7 = input_ptr[7 * stride]; + + float u = d1 + d2; + float v = d1 - d2; + float w = d3 + d4; + float x = d3 - d4; + float y = d5 + d6; + float z = d5 - d6; + + s[i][0] = d0 + u + w + y * 32; + s[i][1] = v + x + x + z * 16; + s[i][2] = u + w * 4 + y * 8; + s[i][3] = v + x * 8 + z * 4; + s[i][4] = u + w * 16 + y + y; + s[i][5] = v + x * 32 + z + d7; + + input_ptr += 8 * stride; + } + + float *output_ptr = output + n * output_batch_size + + m * out_image_size + h * out_width + w; + + for (int i = 0; i < 6; ++i) { + float d0, d1, d2, d3, d4, d5, d6, d7; + d0 = s[0][i]; + d1 = s[1][i]; + d2 = s[2][i]; + d3 = s[3][i]; + d4 = s[4][i]; + d5 = s[5][i]; + d6 = s[6][i]; + d7 = s[7][i]; + + float u = d1 + d2; + float v = d1 - d2; + float w = d3 + d4; + float x = d3 - d4; + float y = d5 + d6; + float z = d5 - d6; + + output_ptr[i] = d0 + u + w + y * 32; + output_ptr[1 * out_width + i] = v + x + x + z * 16; + output_ptr[2 * out_width + i] = u + w * 4 + y * 8; + output_ptr[3 * out_width + i] = v + x * 8 + z * 4; + output_ptr[4 * out_width + i] = u + w * 16 + y + y; + output_ptr[5 * out_width + i] = v + x * 32 + z + d7; + } + + ++tile_offset; } - - ++tile_offset; } } } - } + }, 0, batch, 1, 0, out_channels, 1); } } // namespace fp32 diff --git a/mace/ops/arm/fp32/conv_2d_3x3_winograd.h b/mace/ops/arm/fp32/conv_2d_3x3_winograd.h index 3ed8646b17c12424a884611ac22698c6d3a9bf05..53118a6aea3b2d8d3a75b08fa5d0b0f84ef69203 100644 --- a/mace/ops/arm/fp32/conv_2d_3x3_winograd.h +++ b/mace/ops/arm/fp32/conv_2d_3x3_winograd.h @@ -31,7 +31,7 @@ namespace fp32 { class Conv2dK3x3Winograd : public Conv2dBase { public: - Conv2dK3x3Winograd(const std::vector paddings, + Conv2dK3x3Winograd(const std::vector &paddings, const Padding padding_type) : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type), gemm_(), @@ -44,20 +44,23 @@ class Conv2dK3x3Winograd : public Conv2dBase { const OpContext *context, const Tensor *input, const Tensor *filter, - Tensor *output); + Tensor *output) override; private: - void TransformFilter4x4(const float *filter, + void TransformFilter4x4(const OpContext *context, + const float *filter, const index_t in_channels, const index_t out_channels, float *output); - void TransformFilter8x8(const float *filter, + void TransformFilter8x8(const OpContext *context, + const float *filter, const index_t in_channels, const index_t out_channels, float *output); - void TransformInput4x4(const float *input, + void TransformInput4x4(const OpContext *context, + const float *input, const index_t batch, const index_t in_height, const index_t in_width, @@ -65,7 +68,8 @@ class Conv2dK3x3Winograd : public Conv2dBase { const index_t tile_count, float *output); - void 
TransformInput8x8(const float *input, + void TransformInput8x8(const OpContext *context, + const float *input, const index_t batch, const index_t in_height, const index_t in_width, @@ -73,7 +77,8 @@ class Conv2dK3x3Winograd : public Conv2dBase { const index_t tile_count, float *output); - void TransformOutput4x4(const float *input, + void TransformOutput4x4(const OpContext *context, + const float *input, index_t batch, index_t out_height, index_t out_width, @@ -81,7 +86,8 @@ class Conv2dK3x3Winograd : public Conv2dBase { index_t tile_count, float *output); - void TransformOutput8x8(const float *input, + void TransformOutput8x8(const OpContext *context, + const float *input, index_t batch, index_t out_height, index_t out_width, diff --git a/mace/ops/arm/fp32/conv_2d_5x5.cc b/mace/ops/arm/fp32/conv_2d_5x5.cc index 264e48fa13f91756c47fae6f5b9db9ed7f2cc57c..1b41ec7ccd87a14e5683e1f84bc6f967e159b5b3 100644 --- a/mace/ops/arm/fp32/conv_2d_5x5.cc +++ b/mace/ops/arm/fp32/conv_2d_5x5.cc @@ -12,9 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "mace/ops/arm/fp32/conv_2d_5x5.h" + #include #include -#include "mace/ops/arm/fp32/conv_2d_5x5.h" namespace mace { namespace ops { @@ -91,11 +92,11 @@ MaceStatus Conv2dK5x5S1::Compute(const OpContext *context, &padded_input, &padded_output); const Tensor *in_tensor = input; - if (padded_input.get() != nullptr) { + if (padded_input != nullptr) { in_tensor = padded_input.get(); } Tensor *out_tensor = output; - if (padded_output.get() != nullptr) { + if (padded_output != nullptr) { out_tensor = padded_output.get(); } out_tensor->Clear(); @@ -107,104 +108,62 @@ MaceStatus Conv2dK5x5S1::Compute(const OpContext *context, auto input_data = in_tensor->data(); auto output_data = out_tensor->mutable_data(); - auto in_shape = in_tensor->shape(); - auto out_shape = out_tensor->shape(); - - const index_t in_image_size = in_shape[2] * in_shape[3]; - const index_t out_image_size = out_shape[2] * out_shape[3]; - const index_t in_batch_size = in_shape[1] * in_image_size; - const index_t out_batch_size = out_shape[1] * out_image_size; - -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t m = 0; m < out_shape[1]; m += 4) { - const index_t out_channels = out_shape[1]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - const index_t in_channels = in_shape[1]; - const index_t in_width = in_shape[3]; - if (m + 3 < out_channels) { - float *out_ptr0_base = - output_data + b * out_batch_size + m * out_image_size; - float *out_ptr1_base = - output_data + b * out_batch_size + (m + 1) * out_image_size; - float *out_ptr2_base = - output_data + b * out_batch_size + (m + 2) * out_image_size; - float *out_ptr3_base = - output_data + b * out_batch_size + (m + 3) * out_image_size; - - for (index_t c = 0; c < in_channels; ++c) { - const float *in_ptr_base = - input_data + b * in_batch_size + c * in_image_size; - const float - *filter_ptr0 = filter_data + m * in_channels * 25 + c * 25; - const float *filter_ptr1 = - filter_data + (m + 1) * in_channels * 25 + c * 25; - const float *filter_ptr2 = - filter_data + (m + 2) * in_channels * 25 + c * 25; - const float *filter_ptr3 = - filter_data + (m + 3) * in_channels * 25 + c * 25; - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { - // input offset - index_t in_offset = h * in_width + w; - // output (4 outch x 1 height x 4 width): 
vo_outch_height - float32x4_t vo0, vo1, vo2, vo3; - // load output - index_t out_offset = h * out_width + w; - vo0 = vld1q_f32(out_ptr0_base + out_offset); - vo1 = vld1q_f32(out_ptr1_base + out_offset); - vo2 = vld1q_f32(out_ptr2_base + out_offset); - vo3 = vld1q_f32(out_ptr3_base + out_offset); - for (index_t r = 0; r < 5; ++r) { - // input (3 slide) - float32x4_t vi0, vi1, vi2, vi3, vi4; - // load input - vi0 = vld1q_f32(in_ptr_base + in_offset); - vi4 = vld1q_f32(in_ptr_base + in_offset + 4); - vi1 = vextq_f32(vi0, vi4, 1); - vi2 = vextq_f32(vi0, vi4, 2); - vi3 = vextq_f32(vi0, vi4, 3); - - MACE_Conv2dNeonK5x5SnLoadCalc4; - - in_offset += in_width; - filter_ptr0 += 5; - filter_ptr1 += 5; - filter_ptr2 += 5; - filter_ptr3 += 5; - } // r - - vst1q_f32(out_ptr0_base + out_offset, vo0); - vst1q_f32(out_ptr1_base + out_offset, vo1); - vst1q_f32(out_ptr2_base + out_offset, vo2); - vst1q_f32(out_ptr3_base + out_offset, vo3); - - filter_ptr0 -= 25; - filter_ptr1 -= 25; - filter_ptr2 -= 25; - filter_ptr3 -= 25; - } // w - } // h - } // c - } else { - for (index_t mm = m; mm < out_channels; ++mm) { + auto &in_shape = in_tensor->shape(); + auto &out_shape = out_tensor->shape(); + + const index_t batch = in_shape[0]; + const index_t in_channels = in_shape[1]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + const index_t out_channels = out_shape[1]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + + const index_t in_image_size = in_height * in_width; + const index_t out_image_size = out_height * out_width; + const index_t in_batch_size = in_channels * in_image_size; + const index_t out_batch_size = out_channels * out_image_size; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t m = start1; m < end1; m += step1) { + if (m + 3 < out_channels) { float *out_ptr0_base = - output_data + b * out_batch_size + mm * out_image_size; + output_data + b * out_batch_size + m * out_image_size; + float *out_ptr1_base = + output_data + b * out_batch_size + (m + 1) * out_image_size; + float *out_ptr2_base = + output_data + b * out_batch_size + (m + 2) * out_image_size; + float *out_ptr3_base = + output_data + b * out_batch_size + (m + 3) * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { const float *in_ptr_base = input_data + b * in_batch_size + c * in_image_size; const float - *filter_ptr0 = filter_data + mm * in_channels * 25 + c * 25; + *filter_ptr0 = filter_data + m * in_channels * 25 + c * 25; + const float *filter_ptr1 = + filter_data + (m + 1) * in_channels * 25 + c * 25; + const float *filter_ptr2 = + filter_data + (m + 2) * in_channels * 25 + c * 25; + const float *filter_ptr3 = + filter_data + (m + 3) * in_channels * 25 + c * 25; for (index_t h = 0; h < out_height; ++h) { for (index_t w = 0; w + 3 < out_width; w += 4) { // input offset index_t in_offset = h * in_width + w; - // output (1 outch x 1 height x 4 width): vo_outch_height - float32x4_t vo0; + // output (4 outch x 1 height x 4 width): vo_outch_height + float32x4_t vo0, vo1, vo2, vo3; // load output index_t out_offset = h * out_width + w; vo0 = vld1q_f32(out_ptr0_base + out_offset); + vo1 = vld1q_f32(out_ptr1_base + out_offset); + vo2 = vld1q_f32(out_ptr2_base + out_offset); + vo3 = vld1q_f32(out_ptr3_base + out_offset); for (index_t r = 0; 
r < 5; ++r) { // input (3 slide) float32x4_t vi0, vi1, vi2, vi3, vi4; @@ -215,21 +174,71 @@ MaceStatus Conv2dK5x5S1::Compute(const OpContext *context, vi2 = vextq_f32(vi0, vi4, 2); vi3 = vextq_f32(vi0, vi4, 3); - MACE_Conv2dNeonK5x5SnLoadCalc1; + MACE_Conv2dNeonK5x5SnLoadCalc4; in_offset += in_width; filter_ptr0 += 5; + filter_ptr1 += 5; + filter_ptr2 += 5; + filter_ptr3 += 5; } // r vst1q_f32(out_ptr0_base + out_offset, vo0); + vst1q_f32(out_ptr1_base + out_offset, vo1); + vst1q_f32(out_ptr2_base + out_offset, vo2); + vst1q_f32(out_ptr3_base + out_offset, vo3); + filter_ptr0 -= 25; + filter_ptr1 -= 25; + filter_ptr2 -= 25; + filter_ptr3 -= 25; } // w } // h } // c - } // mm - } // if - } // m - } // b + } else { + for (index_t mm = m; mm < out_channels; ++mm) { + float *out_ptr0_base = + output_data + b * out_batch_size + mm * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr0 = filter_data + mm * in_channels * 25 + c * 25; + for (index_t h = 0; h < out_height; ++h) { + for (index_t w = 0; w + 3 < out_width; w += 4) { + // input offset + index_t in_offset = h * in_width + w; + // output (1 outch x 1 height x 4 width): vo_outch_height + float32x4_t vo0; + // load output + index_t out_offset = h * out_width + w; + vo0 = vld1q_f32(out_ptr0_base + out_offset); + for (index_t r = 0; r < 5; ++r) { + // input (3 slide) + float32x4_t vi0, vi1, vi2, vi3, vi4; + // load input + vi0 = vld1q_f32(in_ptr_base + in_offset); + vi4 = vld1q_f32(in_ptr_base + in_offset + 4); + vi1 = vextq_f32(vi0, vi4, 1); + vi2 = vextq_f32(vi0, vi4, 2); + vi3 = vextq_f32(vi0, vi4, 3); + + MACE_Conv2dNeonK5x5SnLoadCalc1; + + in_offset += in_width; + filter_ptr0 += 5; + } // r + + vst1q_f32(out_ptr0_base + out_offset, vo0); + filter_ptr0 -= 25; + } // w + } // h + } // c + } // mm + } // if + } // m + } // b + }, 0, batch, 1, 0, out_channels, 4); UnPadOutput(*out_tensor, output); return MaceStatus::MACE_SUCCESS; diff --git a/mace/ops/arm/fp32/conv_2d_5x5.h b/mace/ops/arm/fp32/conv_2d_5x5.h index 154d74a849f38c5b114f70d897946a220a722d2c..b6fdf9bbda9d7edc7593a08e30ce6f30987de2a4 100644 --- a/mace/ops/arm/fp32/conv_2d_5x5.h +++ b/mace/ops/arm/fp32/conv_2d_5x5.h @@ -28,7 +28,7 @@ namespace fp32 { class Conv2dK5x5S1 : public Conv2dBase { public: - Conv2dK5x5S1(const std::vector paddings, const Padding padding_type) + Conv2dK5x5S1(const std::vector &paddings, const Padding padding_type) : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {} virtual ~Conv2dK5x5S1() {} @@ -36,7 +36,7 @@ class Conv2dK5x5S1 : public Conv2dBase { const OpContext *context, const Tensor *input, const Tensor *filter, - Tensor *output); + Tensor *output) override; }; diff --git a/mace/ops/arm/fp32/conv_2d_7x7.cc b/mace/ops/arm/fp32/conv_2d_7x7.cc index 86d3e468f494bb42e3f5c3ecaf608adca72cea5a..4ee8a045a8c61e72fb615816af0fc9c52b77f9b9 100644 --- a/mace/ops/arm/fp32/conv_2d_7x7.cc +++ b/mace/ops/arm/fp32/conv_2d_7x7.cc @@ -12,9 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
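(Editorial sketch, not part of the patch.) The 5x5 and 7x7 kernels above rely on one NEON idiom throughout: two vld1q_f32 loads plus vextq_f32 produce every shifted view of an input row, and those views are multiply-accumulated into four adjacent output pixels (the blocked branch additionally handles four output channels at once via the MACE_Conv2dNeonK5x5SnLoadCalc4 macro). Below is a minimal sketch of that sliding-window step for a single 5-tap row; Conv1dRow5Tap is a hypothetical helper, and it uses vmlaq_n_f32 with scalar filter taps where the patch uses the lane variants.

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#include <arm_neon.h>

// out[j] += sum_{k=0..4} in[j + k] * filter[k], for j = 0..3.
static inline float32x4_t Conv1dRow5Tap(const float *in,
                                        const float *filter,
                                        float32x4_t acc) {
  float32x4_t vi0 = vld1q_f32(in);           // in[0..3]
  float32x4_t vi4 = vld1q_f32(in + 4);       // in[4..7]
  float32x4_t vi1 = vextq_f32(vi0, vi4, 1);  // in[1..4]
  float32x4_t vi2 = vextq_f32(vi0, vi4, 2);  // in[2..5]
  float32x4_t vi3 = vextq_f32(vi0, vi4, 3);  // in[3..6]
  acc = vmlaq_n_f32(acc, vi0, filter[0]);
  acc = vmlaq_n_f32(acc, vi1, filter[1]);
  acc = vmlaq_n_f32(acc, vi2, filter[2]);
  acc = vmlaq_n_f32(acc, vi3, filter[3]);
  acc = vmlaq_n_f32(acc, vi4, filter[4]);
  return acc;
}
#endif  // ARM NEON only; illustrates the vld1q_f32 + vextq_f32 idiom above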
+#include "mace/ops/arm/fp32/conv_2d_7x7.h" + #include #include -#include "mace/ops/arm/fp32/conv_2d_7x7.h" namespace mace { namespace ops { @@ -168,11 +169,11 @@ MaceStatus Conv2dK7x7S1::Compute(const OpContext *context, &padded_input, &padded_output); const Tensor *in_tensor = input; - if (padded_input.get() != nullptr) { + if (padded_input != nullptr) { in_tensor = padded_input.get(); } Tensor *out_tensor = output; - if (padded_output.get() != nullptr) { + if (padded_output != nullptr) { out_tensor = padded_output.get(); } out_tensor->Clear(); @@ -184,111 +185,61 @@ MaceStatus Conv2dK7x7S1::Compute(const OpContext *context, auto input_data = in_tensor->data(); auto output_data = out_tensor->mutable_data(); - auto in_shape = in_tensor->shape(); - auto out_shape = out_tensor->shape(); - - const index_t in_image_size = in_shape[2] * in_shape[3]; - const index_t out_image_size = out_shape[2] * out_shape[3]; - const index_t in_batch_size = in_shape[1] * in_image_size; - const index_t out_batch_size = out_shape[1] * out_image_size; - -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t m = 0; m < out_shape[1]; m += 4) { - const index_t out_channels = out_shape[1]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - const index_t in_channels = in_shape[1]; - const index_t in_width = in_shape[3]; - if (m + 3 < out_channels) { - float *out_ptr0_base = - output_data + b * out_batch_size + m * out_image_size; - float *out_ptr1_base = - output_data + b * out_batch_size + (m + 1) * out_image_size; - float *out_ptr2_base = - output_data + b * out_batch_size + (m + 2) * out_image_size; - float *out_ptr3_base = - output_data + b * out_batch_size + (m + 3) * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { - const float *in_ptr_base = - input_data + b * in_batch_size + c * in_image_size; - const float - *filter_ptr0 = filter_data + m * in_channels * 49 + c * 49; - const float *filter_ptr1 = - filter_data + (m + 1) * in_channels * 49 + c * 49; - const float *filter_ptr2 = - filter_data + (m + 2) * in_channels * 49 + c * 49; - const float *filter_ptr3 = - filter_data + (m + 3) * in_channels * 49 + c * 49; - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { - // input offset - index_t in_offset = h * in_width + w; - // output (4 outch x 1 height x 4 width): vo_outch_height - float32x4_t vo0, vo1, vo2, vo3; - // load output - index_t out_offset = h * out_width + w; - vo0 = vld1q_f32(out_ptr0_base + out_offset); - vo1 = vld1q_f32(out_ptr1_base + out_offset); - vo2 = vld1q_f32(out_ptr2_base + out_offset); - vo3 = vld1q_f32(out_ptr3_base + out_offset); - for (index_t r = 0; r < 7; ++r) { - // input (3 slide) - float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6; - float32x4_t vi8; // for tmp use - // load input - vi0 = vld1q_f32(in_ptr_base + in_offset); - vi4 = vld1q_f32(in_ptr_base + in_offset + 4); - vi8 = vld1q_f32(in_ptr_base + in_offset + 8); - vi1 = vextq_f32(vi0, vi4, 1); - vi2 = vextq_f32(vi0, vi4, 2); - vi3 = vextq_f32(vi0, vi4, 3); - vi5 = vextq_f32(vi4, vi8, 1); - vi6 = vextq_f32(vi4, vi8, 2); - -#if defined(__aarch64__) - MACE_Conv2dArmv8NeonK7x7SnLoadCalc4; -#else - MACE_Conv2dArmv7NeonK7x7SnLoadCalc4; -#endif - - in_offset += in_width; - filter_ptr0 += 7; - filter_ptr1 += 7; - filter_ptr2 += 7; - filter_ptr3 += 7; - } // r - - vst1q_f32(out_ptr0_base + out_offset, vo0); - vst1q_f32(out_ptr1_base + out_offset, vo1); - vst1q_f32(out_ptr2_base + 
out_offset, vo2); - vst1q_f32(out_ptr3_base + out_offset, vo3); - - filter_ptr0 -= 49; - filter_ptr1 -= 49; - filter_ptr2 -= 49; - filter_ptr3 -= 49; - } // w - } // h - } // c - } else { - for (index_t mm = m; mm < out_channels; ++mm) { + auto &in_shape = in_tensor->shape(); + auto &out_shape = out_tensor->shape(); + + const index_t batch = in_shape[0]; + const index_t in_channels = in_shape[1]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + const index_t out_channels = out_shape[1]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + + const index_t in_image_size = in_height * in_width; + const index_t out_image_size = out_height * out_width; + const index_t in_batch_size = in_channels * in_image_size; + const index_t out_batch_size = out_channels * out_image_size; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t m = start1; m < end1; m += step1) { + if (m + 3 < out_channels) { float *out_ptr0_base = - output_data + b * out_batch_size + mm * out_image_size; + output_data + b * out_batch_size + m * out_image_size; + float *out_ptr1_base = + output_data + b * out_batch_size + (m + 1) * out_image_size; + float *out_ptr2_base = + output_data + b * out_batch_size + (m + 2) * out_image_size; + float *out_ptr3_base = + output_data + b * out_batch_size + (m + 3) * out_image_size; for (index_t c = 0; c < in_channels; ++c) { const float *in_ptr_base = input_data + b * in_batch_size + c * in_image_size; const float - *filter_ptr0 = filter_data + mm * in_channels * 49 + c * 49; + *filter_ptr0 = filter_data + m * in_channels * 49 + c * 49; + const float *filter_ptr1 = + filter_data + (m + 1) * in_channels * 49 + c * 49; + const float *filter_ptr2 = + filter_data + (m + 2) * in_channels * 49 + c * 49; + const float *filter_ptr3 = + filter_data + (m + 3) * in_channels * 49 + c * 49; for (index_t h = 0; h < out_height; ++h) { for (index_t w = 0; w + 3 < out_width; w += 4) { // input offset index_t in_offset = h * in_width + w; - // output (1 outch x 1 height x 4 width): vo_outch_height - float32x4_t vo0; + // output (4 outch x 1 height x 4 width): vo_outch_height + float32x4_t vo0, vo1, vo2, vo3; // load output index_t out_offset = h * out_width + w; vo0 = vld1q_f32(out_ptr0_base + out_offset); + vo1 = vld1q_f32(out_ptr1_base + out_offset); + vo2 = vld1q_f32(out_ptr2_base + out_offset); + vo3 = vld1q_f32(out_ptr3_base + out_offset); for (index_t r = 0; r < 7; ++r) { // input (3 slide) float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6; @@ -304,24 +255,82 @@ MaceStatus Conv2dK7x7S1::Compute(const OpContext *context, vi6 = vextq_f32(vi4, vi8, 2); #if defined(__aarch64__) - MACE_Conv2dArmv8NeonK7x7SnLoadCalc1; + MACE_Conv2dArmv8NeonK7x7SnLoadCalc4; #else - MACE_Conv2dArmv7NeonK7x7SnLoadCalc1; + MACE_Conv2dArmv7NeonK7x7SnLoadCalc4; #endif in_offset += in_width; filter_ptr0 += 7; + filter_ptr1 += 7; + filter_ptr2 += 7; + filter_ptr3 += 7; } // r vst1q_f32(out_ptr0_base + out_offset, vo0); + vst1q_f32(out_ptr1_base + out_offset, vo1); + vst1q_f32(out_ptr2_base + out_offset, vo2); + vst1q_f32(out_ptr3_base + out_offset, vo3); + filter_ptr0 -= 49; + filter_ptr1 -= 49; + filter_ptr2 -= 49; + filter_ptr3 -= 49; } // w } // h } // c - } // mm - } // if - } // m - } // b + } else { + for (index_t mm = m; mm < out_channels; 
++mm) { + float *out_ptr0_base = + output_data + b * out_batch_size + mm * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr0 = filter_data + mm * in_channels * 49 + c * 49; + for (index_t h = 0; h < out_height; ++h) { + for (index_t w = 0; w + 3 < out_width; w += 4) { + // input offset + index_t in_offset = h * in_width + w; + // output (1 outch x 1 height x 4 width): vo_outch_height + float32x4_t vo0; + // load output + index_t out_offset = h * out_width + w; + vo0 = vld1q_f32(out_ptr0_base + out_offset); + for (index_t r = 0; r < 7; ++r) { + // input (3 slide) + float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6; + float32x4_t vi8; // for tmp use + // load input + vi0 = vld1q_f32(in_ptr_base + in_offset); + vi4 = vld1q_f32(in_ptr_base + in_offset + 4); + vi8 = vld1q_f32(in_ptr_base + in_offset + 8); + vi1 = vextq_f32(vi0, vi4, 1); + vi2 = vextq_f32(vi0, vi4, 2); + vi3 = vextq_f32(vi0, vi4, 3); + vi5 = vextq_f32(vi4, vi8, 1); + vi6 = vextq_f32(vi4, vi8, 2); + +#if defined(__aarch64__) + MACE_Conv2dArmv8NeonK7x7SnLoadCalc1; +#else + MACE_Conv2dArmv7NeonK7x7SnLoadCalc1; +#endif + + in_offset += in_width; + filter_ptr0 += 7; + } // r + + vst1q_f32(out_ptr0_base + out_offset, vo0); + filter_ptr0 -= 49; + } // w + } // h + } // c + } // mm + } // if + } // m + } // b + }, 0, batch, 1, 0, out_channels, 4); UnPadOutput(*out_tensor, output); return MaceStatus::MACE_SUCCESS; @@ -342,11 +351,11 @@ MaceStatus Conv2dK7x7S2::Compute(const OpContext *context, &padded_input, &padded_output); const Tensor *in_tensor = input; - if (padded_input.get() != nullptr) { + if (padded_input != nullptr) { in_tensor = padded_input.get(); } Tensor *out_tensor = output; - if (padded_output.get() != nullptr) { + if (padded_output != nullptr) { out_tensor = padded_output.get(); } out_tensor->Clear(); @@ -358,118 +367,63 @@ MaceStatus Conv2dK7x7S2::Compute(const OpContext *context, auto input_data = in_tensor->data(); auto output_data = out_tensor->mutable_data(); - auto in_shape = in_tensor->shape(); - auto out_shape = out_tensor->shape(); - - const index_t in_image_size = in_shape[2] * in_shape[3]; - const index_t out_image_size = out_shape[2] * out_shape[3]; - const index_t in_batch_size = in_shape[1] * in_image_size; - const index_t out_batch_size = out_shape[1] * out_image_size; - -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t m = 0; m < out_shape[1]; m += 4) { - const index_t out_channels = out_shape[1]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - const index_t in_channels = in_shape[1]; - const index_t in_width = in_shape[3]; - if (m + 3 < out_channels) { - float *out_ptr0_base = - output_data + b * out_batch_size + m * out_image_size; - float *out_ptr1_base = - output_data + b * out_batch_size + (m + 1) * out_image_size; - float *out_ptr2_base = - output_data + b * out_batch_size + (m + 2) * out_image_size; - float *out_ptr3_base = - output_data + b * out_batch_size + (m + 3) * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { - const float *in_ptr_base = - input_data + b * in_batch_size + c * in_image_size; - const float - *filter_ptr0 = filter_data + m * in_channels * 49 + c * 49; - const float *filter_ptr1 = - filter_data + (m + 1) * in_channels * 49 + c * 49; - const float *filter_ptr2 = - filter_data + (m + 2) * in_channels * 49 + c * 49; - const float *filter_ptr3 = 
- filter_data + (m + 3) * in_channels * 49 + c * 49; - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { - // input offset - index_t in_h = h * 2; - index_t in_w = w * 2; - index_t in_offset = in_h * in_width + in_w; - // output (4 outch x 1 height x 4 width): vo_outch_height - float32x4_t vo0, vo1, vo2, vo3; - // load output - index_t out_offset = h * out_width + w; - vo0 = vld1q_f32(out_ptr0_base + out_offset); - vo1 = vld1q_f32(out_ptr1_base + out_offset); - vo2 = vld1q_f32(out_ptr2_base + out_offset); - vo3 = vld1q_f32(out_ptr3_base + out_offset); - for (index_t r = 0; r < 7; ++r) { - // input (3 slide) - float32x4x2_t vvi0, vvi1; // to de-interleave - float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6; - // load input - // [0.2.4.6, 1.3.5.7] - vvi0 = vld2q_f32(in_ptr_base + in_offset); - // [8.10.12.14, 9.11.13.15] - vvi1 = vld2q_f32(in_ptr_base + in_offset + 8); - vi0 = vvi0.val[0]; // [0.2.4.6] - vi1 = vvi0.val[1]; // [1.3.5.7] - vi2 = vextq_f32(vi0, vvi1.val[0], 1); // [2.4.6.8] - vi3 = vextq_f32(vi1, vvi1.val[1], 1); // [3.5.7.9] - vi4 = vextq_f32(vi0, vvi1.val[0], 2); // [4.6.8.10] - vi5 = vextq_f32(vi1, vvi1.val[1], 2); // [5.7.9.11] - vi6 = vextq_f32(vi0, vvi1.val[0], 3); // [6.8.10.12] - -#if defined(__aarch64__) - MACE_Conv2dArmv8NeonK7x7SnLoadCalc4; -#else - MACE_Conv2dArmv7NeonK7x7SnLoadCalc4; -#endif - - in_offset += in_width; - filter_ptr0 += 7; - filter_ptr1 += 7; - filter_ptr2 += 7; - filter_ptr3 += 7; - } // r - - vst1q_f32(out_ptr0_base + out_offset, vo0); - vst1q_f32(out_ptr1_base + out_offset, vo1); - vst1q_f32(out_ptr2_base + out_offset, vo2); - vst1q_f32(out_ptr3_base + out_offset, vo3); - - filter_ptr0 -= 49; - filter_ptr1 -= 49; - filter_ptr2 -= 49; - filter_ptr3 -= 49; - } // w - } // h - } // c - } else { - for (index_t mm = m; mm < out_channels; ++mm) { + auto &in_shape = in_tensor->shape(); + auto &out_shape = out_tensor->shape(); + + const index_t batch = in_shape[0]; + const index_t in_channels = in_shape[1]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + const index_t out_channels = out_shape[1]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + + const index_t in_image_size = in_height * in_width; + const index_t out_image_size = out_height * out_width; + const index_t in_batch_size = in_channels * in_image_size; + const index_t out_batch_size = out_channels * out_image_size; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t m = start1; m < end1; m += step1) { + if (m + 3 < out_channels) { float *out_ptr0_base = - output_data + b * out_batch_size + mm * out_image_size; + output_data + b * out_batch_size + m * out_image_size; + float *out_ptr1_base = + output_data + b * out_batch_size + (m + 1) * out_image_size; + float *out_ptr2_base = + output_data + b * out_batch_size + (m + 2) * out_image_size; + float *out_ptr3_base = + output_data + b * out_batch_size + (m + 3) * out_image_size; for (index_t c = 0; c < in_channels; ++c) { const float *in_ptr_base = input_data + b * in_batch_size + c * in_image_size; const float - *filter_ptr0 = filter_data + mm * in_channels * 49 + c * 49; + *filter_ptr0 = filter_data + m * in_channels * 49 + c * 49; + const float *filter_ptr1 = + filter_data + (m + 1) * in_channels * 49 + c * 49; + 
const float *filter_ptr2 = + filter_data + (m + 2) * in_channels * 49 + c * 49; + const float *filter_ptr3 = + filter_data + (m + 3) * in_channels * 49 + c * 49; for (index_t h = 0; h < out_height; ++h) { for (index_t w = 0; w + 3 < out_width; w += 4) { // input offset index_t in_h = h * 2; index_t in_w = w * 2; index_t in_offset = in_h * in_width + in_w; - // output (1 outch x 1 height x 4 width): vo_outch_height - float32x4_t vo0; - // load ouput + // output (4 outch x 1 height x 4 width): vo_outch_height + float32x4_t vo0, vo1, vo2, vo3; + // load output index_t out_offset = h * out_width + w; vo0 = vld1q_f32(out_ptr0_base + out_offset); + vo1 = vld1q_f32(out_ptr1_base + out_offset); + vo2 = vld1q_f32(out_ptr2_base + out_offset); + vo3 = vld1q_f32(out_ptr3_base + out_offset); for (index_t r = 0; r < 7; ++r) { // input (3 slide) float32x4x2_t vvi0, vvi1; // to de-interleave @@ -488,24 +442,87 @@ MaceStatus Conv2dK7x7S2::Compute(const OpContext *context, vi6 = vextq_f32(vi0, vvi1.val[0], 3); // [6.8.10.12] #if defined(__aarch64__) - MACE_Conv2dArmv8NeonK7x7SnLoadCalc1; + MACE_Conv2dArmv8NeonK7x7SnLoadCalc4; #else - MACE_Conv2dArmv7NeonK7x7SnLoadCalc1; + MACE_Conv2dArmv7NeonK7x7SnLoadCalc4; #endif in_offset += in_width; filter_ptr0 += 7; + filter_ptr1 += 7; + filter_ptr2 += 7; + filter_ptr3 += 7; } // r vst1q_f32(out_ptr0_base + out_offset, vo0); + vst1q_f32(out_ptr1_base + out_offset, vo1); + vst1q_f32(out_ptr2_base + out_offset, vo2); + vst1q_f32(out_ptr3_base + out_offset, vo3); + filter_ptr0 -= 49; + filter_ptr1 -= 49; + filter_ptr2 -= 49; + filter_ptr3 -= 49; } // w } // h } // c - } // mm - } // if - } // m - } // b + } else { + for (index_t mm = m; mm < out_channels; ++mm) { + float *out_ptr0_base = + output_data + b * out_batch_size + mm * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr0 = filter_data + mm * in_channels * 49 + c * 49; + for (index_t h = 0; h < out_height; ++h) { + for (index_t w = 0; w + 3 < out_width; w += 4) { + // input offset + index_t in_h = h * 2; + index_t in_w = w * 2; + index_t in_offset = in_h * in_width + in_w; + // output (1 outch x 1 height x 4 width): vo_outch_height + float32x4_t vo0; + // load ouput + index_t out_offset = h * out_width + w; + vo0 = vld1q_f32(out_ptr0_base + out_offset); + for (index_t r = 0; r < 7; ++r) { + // input (3 slide) + float32x4x2_t vvi0, vvi1; // to de-interleave + float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6; + // load input + // [0.2.4.6, 1.3.5.7] + vvi0 = vld2q_f32(in_ptr_base + in_offset); + // [8.10.12.14, 9.11.13.15] + vvi1 = vld2q_f32(in_ptr_base + in_offset + 8); + vi0 = vvi0.val[0]; // [0.2.4.6] + vi1 = vvi0.val[1]; // [1.3.5.7] + vi2 = vextq_f32(vi0, vvi1.val[0], 1); // [2.4.6.8] + vi3 = vextq_f32(vi1, vvi1.val[1], 1); // [3.5.7.9] + vi4 = vextq_f32(vi0, vvi1.val[0], 2); // [4.6.8.10] + vi5 = vextq_f32(vi1, vvi1.val[1], 2); // [5.7.9.11] + vi6 = vextq_f32(vi0, vvi1.val[0], 3); // [6.8.10.12] + +#if defined(__aarch64__) + MACE_Conv2dArmv8NeonK7x7SnLoadCalc1; +#else + MACE_Conv2dArmv7NeonK7x7SnLoadCalc1; +#endif + + in_offset += in_width; + filter_ptr0 += 7; + } // r + + vst1q_f32(out_ptr0_base + out_offset, vo0); + filter_ptr0 -= 49; + } // w + } // h + } // c + } // mm + } // if + } // m + } // b + }, 0, batch, 1, 0, out_channels, 4); UnPadOutput(*out_tensor, output); return MaceStatus::MACE_SUCCESS; @@ -526,11 +543,11 @@ MaceStatus Conv2dK7x7S3::Compute(const OpContext *context, 
&padded_input, &padded_output); const Tensor *in_tensor = input; - if (padded_input.get() != nullptr) { + if (padded_input != nullptr) { in_tensor = padded_input.get(); } Tensor *out_tensor = output; - if (padded_output.get() != nullptr) { + if (padded_output != nullptr) { out_tensor = padded_output.get(); } out_tensor->Clear(); @@ -542,118 +559,63 @@ MaceStatus Conv2dK7x7S3::Compute(const OpContext *context, auto input_data = in_tensor->data(); auto output_data = out_tensor->mutable_data(); - auto in_shape = in_tensor->shape(); - auto out_shape = out_tensor->shape(); - - const index_t in_image_size = in_shape[2] * in_shape[3]; - const index_t out_image_size = out_shape[2] * out_shape[3]; - const index_t in_batch_size = in_shape[1] * in_image_size; - const index_t out_batch_size = out_shape[1] * out_image_size; - -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t m = 0; m < out_shape[1]; m += 4) { - const index_t out_channels = out_shape[1]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - const index_t in_channels = in_shape[1]; - const index_t in_width = in_shape[3]; - if (m + 3 < out_channels) { - float *out_ptr0_base = - output_data + b * out_batch_size + m * out_image_size; - float *out_ptr1_base = - output_data + b * out_batch_size + (m + 1) * out_image_size; - float *out_ptr2_base = - output_data + b * out_batch_size + (m + 2) * out_image_size; - float *out_ptr3_base = - output_data + b * out_batch_size + (m + 3) * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { - const float *in_ptr_base = - input_data + b * in_batch_size + c * in_image_size; - const float - *filter_ptr0 = filter_data + m * in_channels * 49 + c * 49; - const float *filter_ptr1 = - filter_data + (m + 1) * in_channels * 49 + c * 49; - const float *filter_ptr2 = - filter_data + (m + 2) * in_channels * 49 + c * 49; - const float *filter_ptr3 = - filter_data + (m + 3) * in_channels * 49 + c * 49; - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { - // input offset - index_t in_h = h * 3; - index_t in_w = w * 3; - index_t in_offset = in_h * in_width + in_w; - // output (4 outch x 1 height x 4 width): vo_outch_height - float32x4_t vo0, vo1, vo2, vo3; - // load output - index_t out_offset = h * out_width + w; - vo0 = vld1q_f32(out_ptr0_base + out_offset); - vo1 = vld1q_f32(out_ptr1_base + out_offset); - vo2 = vld1q_f32(out_ptr2_base + out_offset); - vo3 = vld1q_f32(out_ptr3_base + out_offset); - for (index_t r = 0; r < 7; ++r) { - // input (3 slide) - float32x4x3_t vvi0, vvi1; // to de-interleave - float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6; - // load input - // [0.3.6.9, 1.4.7.10, 2.5.8.11] - vvi0 = vld3q_f32(in_ptr_base + in_offset); - // [12.15.xx.xx, 13.xx.xx.xx, 14.xx.xx.xx] - vvi1 = vld3q_f32(in_ptr_base + in_offset + 12); - vi0 = vvi0.val[0]; // [0.3.6.9] - vi1 = vvi0.val[1]; // [1.4.7.10] - vi2 = vvi0.val[2]; // [2.5.8.11] - vi3 = vextq_f32(vi0, vvi1.val[0], 1); // [3.6.9.12] - vi4 = vextq_f32(vi1, vvi1.val[1], 1); // [4.7.10.13] - vi5 = vextq_f32(vi2, vvi1.val[2], 1); // [5.8.11.14] - vi6 = vextq_f32(vi0, vvi1.val[0], 2); // [6.9.12.15] - -#if defined(__aarch64__) - MACE_Conv2dArmv8NeonK7x7SnLoadCalc4; -#else - MACE_Conv2dArmv7NeonK7x7SnLoadCalc4; -#endif - - in_offset += in_width; - filter_ptr0 += 7; - filter_ptr1 += 7; - filter_ptr2 += 7; - filter_ptr3 += 7; - } // r - - vst1q_f32(out_ptr0_base + out_offset, vo0); - vst1q_f32(out_ptr1_base + 
out_offset, vo1); - vst1q_f32(out_ptr2_base + out_offset, vo2); - vst1q_f32(out_ptr3_base + out_offset, vo3); - - filter_ptr0 -= 49; - filter_ptr1 -= 49; - filter_ptr2 -= 49; - filter_ptr3 -= 49; - } // w - } // h - } // c - } else { - for (index_t mm = m; mm < out_channels; ++mm) { + auto &in_shape = in_tensor->shape(); + auto &out_shape = out_tensor->shape(); + + const index_t batch = in_shape[0]; + const index_t in_channels = in_shape[1]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + const index_t out_channels = out_shape[1]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + + const index_t in_image_size = in_height * in_width; + const index_t out_image_size = out_height * out_width; + const index_t in_batch_size = in_channels * in_image_size; + const index_t out_batch_size = out_channels * out_image_size; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t m = start1; m < end1; m += step1) { + if (m + 3 < out_channels) { float *out_ptr0_base = - output_data + b * out_batch_size + mm * out_image_size; + output_data + b * out_batch_size + m * out_image_size; + float *out_ptr1_base = + output_data + b * out_batch_size + (m + 1) * out_image_size; + float *out_ptr2_base = + output_data + b * out_batch_size + (m + 2) * out_image_size; + float *out_ptr3_base = + output_data + b * out_batch_size + (m + 3) * out_image_size; for (index_t c = 0; c < in_channels; ++c) { const float *in_ptr_base = input_data + b * in_batch_size + c * in_image_size; const float - *filter_ptr0 = filter_data + mm * in_channels * 49 + c * 49; + *filter_ptr0 = filter_data + m * in_channels * 49 + c * 49; + const float *filter_ptr1 = + filter_data + (m + 1) * in_channels * 49 + c * 49; + const float *filter_ptr2 = + filter_data + (m + 2) * in_channels * 49 + c * 49; + const float *filter_ptr3 = + filter_data + (m + 3) * in_channels * 49 + c * 49; for (index_t h = 0; h < out_height; ++h) { for (index_t w = 0; w + 3 < out_width; w += 4) { // input offset index_t in_h = h * 3; index_t in_w = w * 3; index_t in_offset = in_h * in_width + in_w; - // output (1 outch x 1 height x 4 width): vo_outch_height - float32x4_t vo0; + // output (4 outch x 1 height x 4 width): vo_outch_height + float32x4_t vo0, vo1, vo2, vo3; // load output index_t out_offset = h * out_width + w; vo0 = vld1q_f32(out_ptr0_base + out_offset); + vo1 = vld1q_f32(out_ptr1_base + out_offset); + vo2 = vld1q_f32(out_ptr2_base + out_offset); + vo3 = vld1q_f32(out_ptr3_base + out_offset); for (index_t r = 0; r < 7; ++r) { // input (3 slide) float32x4x3_t vvi0, vvi1; // to de-interleave @@ -672,24 +634,87 @@ MaceStatus Conv2dK7x7S3::Compute(const OpContext *context, vi6 = vextq_f32(vi0, vvi1.val[0], 2); // [6.9.12.15] #if defined(__aarch64__) - MACE_Conv2dArmv8NeonK7x7SnLoadCalc1; + MACE_Conv2dArmv8NeonK7x7SnLoadCalc4; #else - MACE_Conv2dArmv7NeonK7x7SnLoadCalc1; + MACE_Conv2dArmv7NeonK7x7SnLoadCalc4; #endif in_offset += in_width; filter_ptr0 += 7; + filter_ptr1 += 7; + filter_ptr2 += 7; + filter_ptr3 += 7; } // r vst1q_f32(out_ptr0_base + out_offset, vo0); + vst1q_f32(out_ptr1_base + out_offset, vo1); + vst1q_f32(out_ptr2_base + out_offset, vo2); + vst1q_f32(out_ptr3_base + out_offset, vo3); + filter_ptr0 -= 49; + filter_ptr1 -= 49; + filter_ptr2 -= 49; + filter_ptr3 -= 
49; } // w } // h } // c - } // mm - } // if - } // m - } // b + } else { + for (index_t mm = m; mm < out_channels; ++mm) { + float *out_ptr0_base = + output_data + b * out_batch_size + mm * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr0 = filter_data + mm * in_channels * 49 + c * 49; + for (index_t h = 0; h < out_height; ++h) { + for (index_t w = 0; w + 3 < out_width; w += 4) { + // input offset + index_t in_h = h * 3; + index_t in_w = w * 3; + index_t in_offset = in_h * in_width + in_w; + // output (1 outch x 1 height x 4 width): vo_outch_height + float32x4_t vo0; + // load output + index_t out_offset = h * out_width + w; + vo0 = vld1q_f32(out_ptr0_base + out_offset); + for (index_t r = 0; r < 7; ++r) { + // input (3 slide) + float32x4x3_t vvi0, vvi1; // to de-interleave + float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6; + // load input + // [0.3.6.9, 1.4.7.10, 2.5.8.11] + vvi0 = vld3q_f32(in_ptr_base + in_offset); + // [12.15.xx.xx, 13.xx.xx.xx, 14.xx.xx.xx] + vvi1 = vld3q_f32(in_ptr_base + in_offset + 12); + vi0 = vvi0.val[0]; // [0.3.6.9] + vi1 = vvi0.val[1]; // [1.4.7.10] + vi2 = vvi0.val[2]; // [2.5.8.11] + vi3 = vextq_f32(vi0, vvi1.val[0], 1); // [3.6.9.12] + vi4 = vextq_f32(vi1, vvi1.val[1], 1); // [4.7.10.13] + vi5 = vextq_f32(vi2, vvi1.val[2], 1); // [5.8.11.14] + vi6 = vextq_f32(vi0, vvi1.val[0], 2); // [6.9.12.15] + +#if defined(__aarch64__) + MACE_Conv2dArmv8NeonK7x7SnLoadCalc1; +#else + MACE_Conv2dArmv7NeonK7x7SnLoadCalc1; +#endif + + in_offset += in_width; + filter_ptr0 += 7; + } // r + + vst1q_f32(out_ptr0_base + out_offset, vo0); + filter_ptr0 -= 49; + } // w + } // h + } // c + } // mm + } // if + } // m + } // b + }, 0, batch, 1, 0, out_channels, 4); UnPadOutput(*out_tensor, output); return MaceStatus::MACE_SUCCESS; diff --git a/mace/ops/arm/fp32/conv_2d_7x7.h b/mace/ops/arm/fp32/conv_2d_7x7.h index e64780bab2bb4c22c2107da29d85b9040ef86460..9324f4daac2392cb069935d3d46fc36274e8b8ea 100644 --- a/mace/ops/arm/fp32/conv_2d_7x7.h +++ b/mace/ops/arm/fp32/conv_2d_7x7.h @@ -28,7 +28,7 @@ namespace fp32 { class Conv2dK7x7S1 : public Conv2dBase { public: - Conv2dK7x7S1(const std::vector paddings, const Padding padding_type) + Conv2dK7x7S1(const std::vector &paddings, const Padding padding_type) : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {} virtual ~Conv2dK7x7S1() {} @@ -36,12 +36,12 @@ class Conv2dK7x7S1 : public Conv2dBase { const OpContext *context, const Tensor *input, const Tensor *filter, - Tensor *output); + Tensor *output) override; }; class Conv2dK7x7S2 : public Conv2dBase { public: - Conv2dK7x7S2(const std::vector paddings, const Padding padding_type) + Conv2dK7x7S2(const std::vector &paddings, const Padding padding_type) : Conv2dBase({2, 2}, {1, 1}, paddings, padding_type) {} virtual ~Conv2dK7x7S2() {} @@ -49,12 +49,12 @@ class Conv2dK7x7S2 : public Conv2dBase { const OpContext *context, const Tensor *input, const Tensor *filter, - Tensor *output); + Tensor *output) override; }; class Conv2dK7x7S3 : public Conv2dBase { public: - Conv2dK7x7S3(const std::vector paddings, const Padding padding_type) + Conv2dK7x7S3(const std::vector &paddings, const Padding padding_type) : Conv2dBase({3, 3}, {1, 1}, paddings, padding_type) {} virtual ~Conv2dK7x7S3() {} @@ -62,7 +62,7 @@ class Conv2dK7x7S3 : public Conv2dBase { const OpContext *context, const Tensor *input, const Tensor *filter, - Tensor *output); + Tensor *output) override; }; } // namespace 
fp32 diff --git a/mace/ops/arm/fp32/conv_general.cc b/mace/ops/arm/fp32/conv_general.cc index a12c5d53b83c275a470f04accdeee07d65317330..25fb2441481cb5ac55da78e44327478b513de018 100644 --- a/mace/ops/arm/fp32/conv_general.cc +++ b/mace/ops/arm/fp32/conv_general.cc @@ -12,9 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include #include "mace/ops/arm/fp32/conv_general.h" +#include + namespace mace { namespace ops { namespace arm { @@ -37,11 +38,11 @@ MaceStatus Conv2dGeneral::Compute(const OpContext *context, &padded_output); const Tensor *in_tensor = input; - if (padded_input.get() != nullptr) { + if (padded_input != nullptr) { in_tensor = padded_input.get(); } Tensor *out_tensor = output; - if (padded_output.get() != nullptr) { + if (padded_output != nullptr) { out_tensor = padded_output.get(); } out_tensor->Clear(); @@ -53,148 +54,70 @@ MaceStatus Conv2dGeneral::Compute(const OpContext *context, auto input_data = in_tensor->data(); auto output_data = out_tensor->mutable_data(); - auto in_shape = in_tensor->shape(); - auto out_shape = out_tensor->shape(); - auto filter_shape = filter->shape(); - - const index_t in_image_size = in_shape[2] * in_shape[3]; - const index_t out_image_size = out_shape[2] * out_shape[3]; - const index_t in_batch_size = filter_shape[1] * in_image_size; - const index_t out_batch_size = filter_shape[0] * out_image_size; - const index_t filter_size = filter_shape[2] * filter_shape[3]; - -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < in_shape[0]; b++) { - for (index_t m = 0; m < filter_shape[0]; m += 4) { - const index_t in_width = in_shape[3]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - const index_t out_channels = filter_shape[0]; - const index_t in_channels = filter_shape[1]; - - const int stride_h = strides_[0]; - const int stride_w = strides_[1]; - const int dilation_h = dilations_[0]; - const int dilation_w = dilations_[1]; - if (m + 3 < out_channels) { - float *out_ptr0_base = - output_data + b * out_batch_size + m * out_image_size; - float *out_ptr1_base = out_ptr0_base + out_image_size; - float *out_ptr2_base = out_ptr1_base + out_image_size; - float *out_ptr3_base = out_ptr2_base + out_image_size; - for (index_t c = 0; c < in_channels; ++c) { - const float *in_ptr_base = - input_data + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = - filter_data + m * in_channels * filter_size + c * filter_size; - const float *filter_ptr1 = filter_ptr0 + in_channels * filter_size; - const float *filter_ptr2 = filter_ptr1 + in_channels * filter_size; - const float *filter_ptr3 = filter_ptr2 + in_channels * filter_size; - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { - // input offset - index_t ih = h * stride_h; - index_t iw = w * stride_w; - index_t in_offset = ih * in_width + iw; - // output (4 outch x 1 height x 4 width): vo_outch_height - float vo0[4], vo1[4], vo2[4], vo3[4]; - // load output - index_t out_offset = h * out_width + w; - for (index_t ow = 0; ow < 4; ++ow) { - vo0[ow] = out_ptr0_base[out_offset + ow]; - vo1[ow] = out_ptr1_base[out_offset + ow]; - vo2[ow] = out_ptr2_base[out_offset + ow]; - vo3[ow] = out_ptr3_base[out_offset + ow]; - } - // calc by row - for (index_t kh = 0; kh < filter_shape[2]; ++kh) { - for (index_t kw = 0; kw < filter_shape[3]; ++kw) { - // outch 0 - vo0[0] += in_ptr_base[in_offset - + kw * dilation_w] * filter_ptr0[kw]; - 
vo0[1] += in_ptr_base[in_offset + stride_w - + kw * dilation_w] * filter_ptr0[kw]; - vo0[2] += in_ptr_base[in_offset + 2 * stride_w - + kw * dilation_w] * filter_ptr0[kw]; - vo0[3] += in_ptr_base[in_offset + 3 * stride_w - + kw * dilation_w] * filter_ptr0[kw]; - // outch 1 - vo1[0] += in_ptr_base[in_offset - + kw * dilation_w] * filter_ptr1[kw]; - vo1[1] += in_ptr_base[in_offset + stride_w - + kw * dilation_w] * filter_ptr1[kw]; - vo1[2] += in_ptr_base[in_offset + 2 * stride_w - + kw * dilation_w] * filter_ptr1[kw]; - vo1[3] += in_ptr_base[in_offset + 3 * stride_w - + kw * dilation_w] * filter_ptr1[kw]; - // outch 2 - vo2[0] += in_ptr_base[in_offset - + kw * dilation_w] * filter_ptr2[kw]; - vo2[1] += in_ptr_base[in_offset + stride_w - + kw * dilation_w] * filter_ptr2[kw]; - vo2[2] += in_ptr_base[in_offset + 2 * stride_w - + kw * dilation_w] * filter_ptr2[kw]; - vo2[3] += in_ptr_base[in_offset + 3 * stride_w - + kw * dilation_w] * filter_ptr2[kw]; - // outch 3 - vo3[0] += in_ptr_base[in_offset - + kw * dilation_w] * filter_ptr3[kw]; - vo3[1] += in_ptr_base[in_offset + stride_w - + kw * dilation_w] * filter_ptr3[kw]; - vo3[2] += in_ptr_base[in_offset + 2 * stride_w - + kw * dilation_w] * filter_ptr3[kw]; - vo3[3] += in_ptr_base[in_offset + 3 * stride_w - + kw * dilation_w] * filter_ptr3[kw]; - } // kw - - in_offset += dilation_h * in_width; - filter_ptr0 += filter_shape[3]; - filter_ptr1 += filter_shape[3]; - filter_ptr2 += filter_shape[3]; - filter_ptr3 += filter_shape[3]; - } // kh - - for (index_t ow = 0; ow < 4; ++ow) { - out_ptr0_base[out_offset + ow] = vo0[ow]; - out_ptr1_base[out_offset + ow] = vo1[ow]; - out_ptr2_base[out_offset + ow] = vo2[ow]; - out_ptr3_base[out_offset + ow] = vo3[ow]; - } - - filter_ptr0 -= filter_size; - filter_ptr1 -= filter_size; - filter_ptr2 -= filter_size; - filter_ptr3 -= filter_size; - } // w - } // h - } // c - } else { - for (index_t mm = m; mm < out_channels; ++mm) { + auto &in_shape = in_tensor->shape(); + auto &out_shape = out_tensor->shape(); + auto &filter_shape = filter->shape(); + + const index_t batch = in_shape[0]; + const index_t in_channels = in_shape[1]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + const index_t out_channels = out_shape[1]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t filter_height = filter_shape[2]; + const index_t filter_width = filter_shape[3]; + + const index_t in_image_size = in_height * in_width; + const index_t out_image_size = out_height * out_width; + const index_t in_batch_size = in_channels * in_image_size; + const index_t out_batch_size = out_channels * out_image_size; + const index_t filter_size = filter_height * filter_width; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t m = start1; m < end1; m += step1) { + const int stride_h = strides_[0]; + const int stride_w = strides_[1]; + const int dilation_h = dilations_[0]; + const int dilation_w = dilations_[1]; + if (m + 3 < out_channels) { float *out_ptr0_base = - output_data + b * out_batch_size + mm * out_image_size; + output_data + b * out_batch_size + m * out_image_size; + float *out_ptr1_base = out_ptr0_base + out_image_size; + float *out_ptr2_base = out_ptr1_base + out_image_size; + float *out_ptr3_base = out_ptr2_base + out_image_size; for 
(index_t c = 0; c < in_channels; ++c) { const float *in_ptr_base = input_data + b * in_batch_size + c * in_image_size; const float *filter_ptr0 = - filter_data + mm * in_channels * filter_size + c * filter_size; - + filter_data + m * in_channels * filter_size + c * filter_size; + const float *filter_ptr1 = filter_ptr0 + in_channels * filter_size; + const float *filter_ptr2 = filter_ptr1 + in_channels * filter_size; + const float *filter_ptr3 = filter_ptr2 + in_channels * filter_size; for (index_t h = 0; h < out_height; ++h) { for (index_t w = 0; w + 3 < out_width; w += 4) { // input offset index_t ih = h * stride_h; index_t iw = w * stride_w; index_t in_offset = ih * in_width + iw; - // output (1 outch x 1 height x 4 width): vo_outch_height - float vo0[4]; + // output (4 outch x 1 height x 4 width): vo_outch_height + float vo0[4], vo1[4], vo2[4], vo3[4]; // load output index_t out_offset = h * out_width + w; for (index_t ow = 0; ow < 4; ++ow) { vo0[ow] = out_ptr0_base[out_offset + ow]; + vo1[ow] = out_ptr1_base[out_offset + ow]; + vo2[ow] = out_ptr2_base[out_offset + ow]; + vo3[ow] = out_ptr3_base[out_offset + ow]; } - // calc by row - for (index_t kh = 0; kh < filter_shape[2]; ++kh) { - for (index_t kw = 0; kw < filter_shape[3]; ++kw) { + for (index_t kh = 0; kh < filter_height; ++kh) { + for (index_t kw = 0; kw < filter_width; ++kw) { // outch 0 vo0[0] += in_ptr_base[in_offset + kw * dilation_w] * filter_ptr0[kw]; @@ -204,23 +127,111 @@ MaceStatus Conv2dGeneral::Compute(const OpContext *context, + kw * dilation_w] * filter_ptr0[kw]; vo0[3] += in_ptr_base[in_offset + 3 * stride_w + kw * dilation_w] * filter_ptr0[kw]; + // outch 1 + vo1[0] += in_ptr_base[in_offset + + kw * dilation_w] * filter_ptr1[kw]; + vo1[1] += in_ptr_base[in_offset + stride_w + + kw * dilation_w] * filter_ptr1[kw]; + vo1[2] += in_ptr_base[in_offset + 2 * stride_w + + kw * dilation_w] * filter_ptr1[kw]; + vo1[3] += in_ptr_base[in_offset + 3 * stride_w + + kw * dilation_w] * filter_ptr1[kw]; + // outch 2 + vo2[0] += in_ptr_base[in_offset + + kw * dilation_w] * filter_ptr2[kw]; + vo2[1] += in_ptr_base[in_offset + stride_w + + kw * dilation_w] * filter_ptr2[kw]; + vo2[2] += in_ptr_base[in_offset + 2 * stride_w + + kw * dilation_w] * filter_ptr2[kw]; + vo2[3] += in_ptr_base[in_offset + 3 * stride_w + + kw * dilation_w] * filter_ptr2[kw]; + // outch 3 + vo3[0] += in_ptr_base[in_offset + + kw * dilation_w] * filter_ptr3[kw]; + vo3[1] += in_ptr_base[in_offset + stride_w + + kw * dilation_w] * filter_ptr3[kw]; + vo3[2] += in_ptr_base[in_offset + 2 * stride_w + + kw * dilation_w] * filter_ptr3[kw]; + vo3[3] += in_ptr_base[in_offset + 3 * stride_w + + kw * dilation_w] * filter_ptr3[kw]; } // kw in_offset += dilation_h * in_width; - filter_ptr0 += filter_shape[3]; + filter_ptr0 += filter_width; + filter_ptr1 += filter_width; + filter_ptr2 += filter_width; + filter_ptr3 += filter_width; } // kh for (index_t ow = 0; ow < 4; ++ow) { out_ptr0_base[out_offset + ow] = vo0[ow]; + out_ptr1_base[out_offset + ow] = vo1[ow]; + out_ptr2_base[out_offset + ow] = vo2[ow]; + out_ptr3_base[out_offset + ow] = vo3[ow]; } + filter_ptr0 -= filter_size; + filter_ptr1 -= filter_size; + filter_ptr2 -= filter_size; + filter_ptr3 -= filter_size; } // w } // h } // c - } // mm - } // if - } // m - } // b + } else { + for (index_t mm = m; mm < out_channels; ++mm) { + float *out_ptr0_base = + output_data + b * out_batch_size + mm * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input_data + b * in_batch_size + c 
* in_image_size; + const float *filter_ptr0 = + filter_data + mm * in_channels * filter_size + + c * filter_size; + + for (index_t h = 0; h < out_height; ++h) { + for (index_t w = 0; w + 3 < out_width; w += 4) { + // input offset + index_t ih = h * stride_h; + index_t iw = w * stride_w; + index_t in_offset = ih * in_width + iw; + // output (1 outch x 1 height x 4 width): vo_outch_height + float vo0[4]; + // load output + index_t out_offset = h * out_width + w; + for (index_t ow = 0; ow < 4; ++ow) { + vo0[ow] = out_ptr0_base[out_offset + ow]; + } + + // calc by row + for (index_t kh = 0; kh < filter_height; ++kh) { + for (index_t kw = 0; kw < filter_width; ++kw) { + // outch 0 + vo0[0] += in_ptr_base[in_offset + + kw * dilation_w] * filter_ptr0[kw]; + vo0[1] += in_ptr_base[in_offset + stride_w + + kw * dilation_w] * filter_ptr0[kw]; + vo0[2] += in_ptr_base[in_offset + 2 * stride_w + + kw * dilation_w] * filter_ptr0[kw]; + vo0[3] += in_ptr_base[in_offset + 3 * stride_w + + kw * dilation_w] * filter_ptr0[kw]; + } // kw + + in_offset += dilation_h * in_width; + filter_ptr0 += filter_width; + } // kh + + for (index_t ow = 0; ow < 4; ++ow) { + out_ptr0_base[out_offset + ow] = vo0[ow]; + } + filter_ptr0 -= filter_size; + } // w + } // h + } // c + } // mm + } // if + } // m + } // b + }, 0, batch, 1, 0, out_channels, 4); UnPadOutput(*out_tensor, output); return MaceStatus::MACE_SUCCESS; diff --git a/mace/ops/arm/fp32/conv_general.h b/mace/ops/arm/fp32/conv_general.h index 01d019548a19fee9c79deb6d918dac9431110fac..115acdb3fe83cb80e1e20e7939c5fe03eed7c6da 100644 --- a/mace/ops/arm/fp32/conv_general.h +++ b/mace/ops/arm/fp32/conv_general.h @@ -28,9 +28,9 @@ namespace fp32 { class Conv2dGeneral : public Conv2dBase { public: - Conv2dGeneral(const std::vector strides, - const std::vector dilations, - const std::vector paddings, + Conv2dGeneral(const std::vector &strides, + const std::vector &dilations, + const std::vector &paddings, const Padding padding_type) : Conv2dBase(strides, dilations, paddings, padding_type) {} virtual ~Conv2dGeneral() {} @@ -39,7 +39,7 @@ class Conv2dGeneral : public Conv2dBase { const OpContext *context, const Tensor *input, const Tensor *filter, - Tensor *output); + Tensor *output) override; }; } // namespace fp32 diff --git a/mace/ops/arm/fp32/deconv_2d.cc b/mace/ops/arm/fp32/deconv_2d.cc new file mode 100644 index 0000000000000000000000000000000000000000..a80d6d645b15720a4210de9c9cdab3fc9c8401b9 --- /dev/null +++ b/mace/ops/arm/fp32/deconv_2d.cc @@ -0,0 +1,120 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "mace/ops/arm/fp32/deconv_2d.h" + +#include +#include +#include "mace/utils/memory.h" +#include "mace/ops/common/conv_pool_2d_util.h" + +namespace mace { +namespace ops { +namespace arm { +namespace fp32 { + +MaceStatus Deconv2dBase::ResizeOutAndPadOut( + const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output, + std::vector *out_pad_size, + std::unique_ptr *padded_output) { + std::vector out_shape; + if (output_shape) { + Tensor::MappingGuard out_shape_guard(output_shape); + MACE_CHECK(output_shape->size() == 4, "output shape should be 4-dims"); + out_shape = + std::vector(output_shape->data(), + output_shape->data() + 4); + } + + std::vector padded_out_shape; + + CalDeconvOutputShapeAndPadSize(input->shape(), + filter->shape(), + strides_, + padding_type_, + paddings_, + group_, + &out_shape, + nullptr, + out_pad_size, + &padded_out_shape, + framework_type_, + NCHW); + + MACE_RETURN_IF_ERROR(output->Resize(out_shape)); + + const bool is_out_padded = + padded_out_shape[2] != out_shape[2] + || padded_out_shape[3] != out_shape[3]; + + if (is_out_padded) { + index_t padded_out_size = + std::accumulate(padded_out_shape.begin(), + padded_out_shape.end(), + 1, + std::multiplies()) * sizeof(float); + ScratchBuffer *scratch = context->device()->scratch_buffer(); + scratch->Rewind(); + index_t scratch_size = PadAlignSize(padded_out_size); + scratch->GrowSize(scratch_size); + + std::unique_ptr + padded_out + (make_unique(scratch->Scratch(scratch_size), DT_FLOAT)); + padded_out->Reshape(padded_out_shape); + *padded_output = std::move(padded_out); + } + + return MaceStatus::MACE_SUCCESS; +} + +void Deconv2dBase::UnPadOutput(const Tensor &src, + const std::vector &out_pad_size, + Tensor *dst) { + if (dst == &src) return; + const index_t pad_h = out_pad_size[0] / 2; + const index_t pad_w = out_pad_size[1] / 2; + + const index_t batch = dst->dim(0); + const index_t channels = dst->dim(1); + const index_t height = dst->dim(2); + const index_t width = dst->dim(3); + const index_t padded_height = src.dim(2); + const index_t padded_width = src.dim(3); + + auto padded_out_data = src.data(); + auto out_data = dst->mutable_data(); + + for (index_t i = 0; i < batch; ++i) { + for (index_t j = 0; j < channels; ++j) { + for (index_t k = 0; k < height; ++k) { + const float *input_base = + padded_out_data + ((i * channels + j) * padded_height + + (k + pad_h)) * padded_width; + float *output_base = + out_data + ((i * channels + j) * height + k) * width; + memcpy(output_base, input_base + pad_w, width * sizeof(float)); + } + } + } +} + +} // namespace fp32 +} // namespace arm +} // namespace ops +} // namespace mace diff --git a/mace/ops/arm/fp32/deconv_2d.h b/mace/ops/arm/fp32/deconv_2d.h new file mode 100644 index 0000000000000000000000000000000000000000..554f2935992d0a6f901bbb7b40aab4b048d63616 --- /dev/null +++ b/mace/ops/arm/fp32/deconv_2d.h @@ -0,0 +1,95 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_OPS_ARM_FP32_DECONV_2D_H_
+#define MACE_OPS_ARM_FP32_DECONV_2D_H_
+
+#include
+#include
+
+#include "mace/public/mace.h"
+#include "mace/core/tensor.h"
+#include "mace/core/types.h"
+#include "mace/core/op_context.h"
+#include "mace/ops/arm/fp32/gemm.h"
+#include "mace/ops/common/conv_pool_2d_util.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+namespace fp32 {
+
+class Deconv2dBase {
+ public:
+  Deconv2dBase(const std::vector<int> &strides,
+               const std::vector<int> &dilations,
+               const std::vector<int> &paddings,
+               const Padding padding_type,
+               const index_t group,
+               const FrameworkType framework_type)
+      : strides_(strides),
+        dilations_(dilations),
+        paddings_(paddings),
+        padding_type_(padding_type),
+        group_(group),
+        framework_type_(framework_type) {}
+
+  Deconv2dBase(const std::vector<int> &strides,
+               const std::vector<int> &dilations,
+               const std::vector<int> &paddings,
+               const Padding padding_type,
+               const FrameworkType framework_type)
+      : Deconv2dBase(strides,
+                     dilations,
+                     paddings,
+                     padding_type,
+                     1,
+                     framework_type) {}
+
+  virtual ~Deconv2dBase() = default;
+
+  virtual MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      const Tensor *output_shape,
+      Tensor *output) = 0;
+
+ protected:
+  MaceStatus ResizeOutAndPadOut(const OpContext *context,
+                                const Tensor *input,
+                                const Tensor *filter,
+                                const Tensor *output_shape,
+                                Tensor *output,
+                                std::vector<int> *out_pad_size,
+                                std::unique_ptr<Tensor> *padded_output);
+
+  void UnPadOutput(const Tensor &src,
+                   const std::vector<int> &out_pad_size,
+                   Tensor *dst);
+
+  const std::vector<int> strides_;
+  const std::vector<int> dilations_;
+  const std::vector<int> paddings_;
+  const Padding padding_type_;
+  index_t group_;
+  const FrameworkType framework_type_;
+};
+
+} // namespace fp32
+} // namespace arm
+} // namespace ops
+} // namespace mace
+
+#endif // MACE_OPS_ARM_FP32_DECONV_2D_H_
diff --git a/mace/ops/arm/fp32/deconv_2d_2x2.cc b/mace/ops/arm/fp32/deconv_2d_2x2.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c9d630bbb63c66d72684663659965e32b2be6b60
--- /dev/null
+++ b/mace/ops/arm/fp32/deconv_2d_2x2.cc
@@ -0,0 +1,342 @@
+// Copyright 2019 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "mace/ops/arm/fp32/deconv_2d_2x2.h" + +#include +#include "mace/ops/arm/fp32/common_neon.h" + +namespace mace { +namespace ops { +namespace arm { +namespace fp32 { + +MaceStatus Deconv2dK2x2S1::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) { + std::unique_ptr padded_out; + std::vector out_pad_size; + ResizeOutAndPadOut(context, + input, + filter, + output_shape, + output, + &out_pad_size, + &padded_out); + + Tensor *out_tensor = output; + if (padded_out != nullptr) { + out_tensor = padded_out.get(); + } + + out_tensor->Clear(); + + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard filter_mapper(filter); + Tensor::MappingGuard output_mapper(output); + + auto input_data = input->data(); + auto filter_data = filter->data(); + auto padded_out_data = out_tensor->mutable_data(); + + auto &in_shape = input->shape(); + auto &out_shape = out_tensor->shape(); + + const index_t batch = in_shape[0]; + const index_t inch = in_shape[1]; + const index_t h = in_shape[2]; + const index_t w = in_shape[3]; + + const index_t outch = out_shape[1]; + const index_t outh = out_shape[2]; + const index_t outw = out_shape[3]; + + const index_t out_img_size = outh * outw; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t oc = start1; oc < end1; oc += step1) { + if (oc + 1 < outch) { + float *out_base0 = padded_out_data + (b * outch + oc) * out_img_size; + float *out_base1 = out_base0 + out_img_size; + for (index_t ic = 0; ic < inch; ++ic) { + const float *input_base = input_data + (b * inch + ic) * h * w; + const float *kernel_base0 = filter_data + (oc * inch + ic) * 4; + const float *kernel_base1 = kernel_base0 + inch * 4; + const float *in = input_base; + // output channel 0 + const float *k0 = kernel_base0; + // output channel 1 + const float *k1 = kernel_base1; + // load filter + float32x4_t k0_vec = vld1q_f32(k0); + float32x4_t k1_vec = vld1q_f32(k1); + + for (index_t i = 0; i < h; ++i) { + float *out_row_base0 = out_base0 + i * outw; + float *out_row0_0 = out_row_base0; + float *out_row0_1 = out_row_base0 + outw; + + float *out_row_base1 = out_base1 + i * outw; + float *out_row1_0 = out_row_base1; + float *out_row1_1 = out_row_base1 + outw; + + index_t j = 0; + + for (; j + 3 < w; j += 4) { + float32x4_t in_vec = vld1q_f32(in); + + float32x4_t out00, out01, out02, out03; + float32x4_t out10, out11, out12, out13; + + out00 = vld1q_f32(out_row0_0); + out00 = neon_vfma_lane_0(out00, in_vec, k0_vec); + vst1q_f32(out_row0_0, out00); + + out01 = vld1q_f32(out_row0_0 + 1); + out01 = neon_vfma_lane_1(out01, in_vec, k0_vec); + vst1q_f32(out_row0_0 + 1, out01); + + out02 = vld1q_f32(out_row0_1); + out02 = neon_vfma_lane_2(out02, in_vec, k0_vec); + vst1q_f32(out_row0_1, out02); + + out03 = vld1q_f32(out_row0_1 + 1); + out03 = neon_vfma_lane_3(out03, in_vec, k0_vec); + vst1q_f32(out_row0_1 + 1, out03); + + out10 = vld1q_f32(out_row1_0); + out10 = neon_vfma_lane_0(out10, in_vec, k1_vec); + vst1q_f32(out_row1_0, out10); + + out11 = vld1q_f32(out_row1_0 + 1); + out11 = neon_vfma_lane_1(out11, in_vec, k1_vec); + vst1q_f32(out_row1_0 + 1, out11); + + out12 = vld1q_f32(out_row1_1); + out12 = neon_vfma_lane_2(out12, in_vec, k1_vec); + vst1q_f32(out_row1_1, out12); + + out13 = vld1q_f32(out_row1_1 + 
1); + out13 = neon_vfma_lane_3(out13, in_vec, k1_vec); + vst1q_f32(out_row1_1 + 1, out13); + + in += 4; + out_row0_0 += 4; + out_row0_1 += 4; + out_row1_0 += 4; + out_row1_1 += 4; + } + + for (; j < w; ++j) { + float val = in[0]; + for (int k = 0; k < 2; ++k) { + out_row0_0[k] += val * k0[k]; + out_row0_1[k] += val * k0[k + 2]; + out_row1_0[k] += val * k1[k]; + out_row1_1[k] += val * k1[k + 2]; + } + in++; + out_row0_0++; + out_row0_1++; + out_row1_0++; + out_row1_1++; + } + } + } + } else { + float *out_base0 = padded_out_data + (b * outch + oc) * outh * outw; + for (index_t ic = 0; ic < inch; ++ic) { + const float *input_base = input_data + (b * inch + ic) * h * w; + const float *kernel_base0 = filter_data + (oc * inch + ic) * 4; + const float *in = input_base; + const float *k0 = kernel_base0; + + // load filter + float32x4_t k0_vec = vld1q_f32(k0); + + for (index_t i = 0; i < h; ++i) { + float *out_row_base0 = out_base0 + i * outw; + float *out_row0_0 = out_row_base0; + float *out_row0_1 = out_row_base0 + outw; + index_t j = 0; + + for (; j + 3 < w; j += 4) { + float32x4_t in_vec = vld1q_f32(in); + float32x4_t out00, out01, out02, out03; + + out00 = vld1q_f32(out_row0_0); + out00 = neon_vfma_lane_0(out00, in_vec, k0_vec); + vst1q_f32(out_row0_0, out00); + + out01 = vld1q_f32(out_row0_0 + 1); + out01 = neon_vfma_lane_1(out01, in_vec, k0_vec); + vst1q_f32(out_row0_0 + 1, out01); + + out02 = vld1q_f32(out_row0_1); + out02 = neon_vfma_lane_2(out02, in_vec, k0_vec); + vst1q_f32(out_row0_1, out02); + + out03 = vld1q_f32(out_row0_1 + 1); + out03 = neon_vfma_lane_3(out03, in_vec, k0_vec); + vst1q_f32(out_row0_1 + 1, out03); + + in += 4; + out_row0_0 += 4; + out_row0_1 += 4; + } + + for (; j < w; ++j) { + float val = in[0]; + for (int k = 0; k < 2; ++k) { + out_row0_0[k] += val * k0[k]; + out_row0_1[k] += val * k0[k + 2]; + } + in++; + out_row0_0++; + out_row0_1++; + } + } + } + } + } + } + }, 0, batch, 1, 0, outch, 2); + + UnPadOutput(*out_tensor, out_pad_size, output); + + return MaceStatus::MACE_SUCCESS; +} + +MaceStatus Deconv2dK2x2S2::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) { + std::unique_ptr padded_out; + std::vector out_pad_size; + ResizeOutAndPadOut(context, + input, + filter, + output_shape, + output, + &out_pad_size, + &padded_out); + + Tensor *out_tensor = output; + if (padded_out != nullptr) { + out_tensor = padded_out.get(); + } + + out_tensor->Clear(); + + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard filter_mapper(filter); + Tensor::MappingGuard output_mapper(output); + + auto input_data = input->data(); + auto filter_data = filter->data(); + auto padded_out_data = out_tensor->mutable_data(); + + auto &in_shape = input->shape(); + auto &out_shape = out_tensor->shape(); + + const index_t batch = in_shape[0]; + const index_t inch = in_shape[1]; + const index_t h = in_shape[2]; + const index_t w = in_shape[3]; + + const index_t outch = out_shape[1]; + const index_t outh = out_shape[2]; + const index_t outw = out_shape[3]; + const index_t out_img_size = outh * outw; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t oc = start1; oc < end1; oc += step1) { + float *out_base = padded_out_data + (b * outch + oc) * out_img_size; + for (index_t ic = 0; ic < 
inch; ++ic) { + const float *input_base = input_data + (b * inch + ic) * h * w; + const float *kernel_base = filter_data + (oc * inch + ic) * 4; + const float *in = input_base; + const float *k0 = kernel_base; + float32x4_t k0_vec = vld1q_f32(k0); + + for (index_t i = 0; i < h; ++i) { + float *out_row_base = out_base + i * 2 * outw; + float *out_row_0 = out_row_base; + float *out_row_1 = out_row_0 + outw; + + index_t j = 0; + + for (; j + 3 < w; j += 4) { + float32x4_t in_vec = vld1q_f32(in); + + // out row 0 + float32x4x2_t out00 = vld2q_f32(out_row_0); + out00.val[0] = + neon_vfma_lane_0(out00.val[0], in_vec, k0_vec); + out00.val[1] = + neon_vfma_lane_1(out00.val[1], in_vec, k0_vec); + vst2q_f32(out_row_0, out00); + + // out row 1 + float32x4x2_t out10 = vld2q_f32(out_row_1); + out10.val[0] = + neon_vfma_lane_2(out10.val[0], in_vec, k0_vec); + out10.val[1] = + neon_vfma_lane_3(out10.val[1], in_vec, k0_vec); + vst2q_f32(out_row_1, out10); + + in += 4; + out_row_0 += 8; + out_row_1 += 8; + } + + for (; j < w; ++j) { + float val = in[0]; + for (int k = 0; k < 2; ++k) { + out_row_0[k] += val * k0[k]; + out_row_1[k] += val * k0[k + 2]; + } + in++; + out_row_0 += 2; + out_row_1 += 2; + } + } + } + } + } + }, 0, batch, 1, 0, outch, 1); + + + UnPadOutput(*out_tensor, out_pad_size, output); + + return MaceStatus::MACE_SUCCESS; +} + +} // namespace fp32 +} // namespace arm +} // namespace ops +} // namespace mace diff --git a/mace/ops/arm/fp32/deconv_2d_2x2.h b/mace/ops/arm/fp32/deconv_2d_2x2.h new file mode 100644 index 0000000000000000000000000000000000000000..05f80dece27fd6cf20d87861e04a512b94706939 --- /dev/null +++ b/mace/ops/arm/fp32/deconv_2d_2x2.h @@ -0,0 +1,70 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#ifndef MACE_OPS_ARM_FP32_DECONV_2D_2X2_H_
+#define MACE_OPS_ARM_FP32_DECONV_2D_2X2_H_
+
+#include
+#include
+
+#include "mace/public/mace.h"
+#include "mace/core/tensor.h"
+#include "mace/core/types.h"
+#include "mace/core/op_context.h"
+#include "mace/ops/arm/fp32/deconv_2d.h"
+#include "mace/ops/common/conv_pool_2d_util.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+namespace fp32 {
+
+class Deconv2dK2x2S1 : public Deconv2dBase {
+ public:
+  Deconv2dK2x2S1(const std::vector<int> &paddings,
+                 const Padding padding_type,
+                 const FrameworkType framework_type)
+      : Deconv2dBase({1, 1}, {1, 1}, paddings, padding_type, framework_type) {}
+  virtual ~Deconv2dK2x2S1() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      const Tensor *output_shape,
+      Tensor *output) override;
+};
+
+class Deconv2dK2x2S2 : public Deconv2dBase {
+ public:
+  Deconv2dK2x2S2(const std::vector<int> &paddings,
+                 const Padding padding_type,
+                 const FrameworkType framework_type)
+      : Deconv2dBase({2, 2}, {1, 1}, paddings, padding_type, framework_type) {}
+  virtual ~Deconv2dK2x2S2() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      const Tensor *output_shape,
+      Tensor *output) override;
+};
+
+} // namespace fp32
+} // namespace arm
+} // namespace ops
+} // namespace mace
+
+#endif // MACE_OPS_ARM_FP32_DECONV_2D_2X2_H_
diff --git a/mace/ops/arm/fp32/deconv_2d_3x3.cc b/mace/ops/arm/fp32/deconv_2d_3x3.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b2ef6eae269316c9169e33bbb753606d8572c1ff
--- /dev/null
+++ b/mace/ops/arm/fp32/deconv_2d_3x3.cc
@@ -0,0 +1,470 @@
+// Copyright 2019 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "mace/ops/arm/fp32/deconv_2d_3x3.h" + +#include +#include "mace/ops/arm/fp32/common_neon.h" + +namespace mace { +namespace ops { +namespace arm { +namespace fp32 { + +MaceStatus Deconv2dK3x3S1::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) { + std::unique_ptr padded_out; + std::vector out_pad_size; + ResizeOutAndPadOut(context, + input, + filter, + output_shape, + output, + &out_pad_size, + &padded_out); + Tensor *out_tensor = output; + if (padded_out != nullptr) { + out_tensor = padded_out.get(); + } + + out_tensor->Clear(); + + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard filter_mapper(filter); + Tensor::MappingGuard output_mapper(output); + + auto input_data = input->data(); + auto filter_data = filter->data(); + auto padded_out_data = out_tensor->mutable_data(); + + auto &in_shape = input->shape(); + auto &out_shape = out_tensor->shape(); + + const index_t batch = out_shape[0]; + const index_t inch = in_shape[1]; + const index_t h = in_shape[2]; + const index_t w = in_shape[3]; + + const index_t outch = out_shape[1]; + const index_t outh = out_shape[2]; + const index_t outw = out_shape[3]; + + const index_t out_img_size = outh * outw; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t oc = start1; oc < end1; oc += step1) { + if (oc + 1 < outch) { + float *out_base0 = padded_out_data + (b * outch + oc) * out_img_size; + float *out_base1 = out_base0 + out_img_size; + for (index_t ic = 0; ic < inch; ++ic) { + const float *input_base = input_data + (b * inch + ic) * h * w; + const float *kernel_base0 = filter_data + (oc * inch + ic) * 9; + const float *kernel_base1 = kernel_base0 + inch * 9; + const float *in = input_base; + + // output channel 0 + const float *k0_0 = kernel_base0; + const float *k0_1 = kernel_base0 + 3; + const float *k0_2 = kernel_base0 + 5; + // output channel 1 + const float *k1_0 = kernel_base1; + const float *k1_1 = kernel_base1 + 3; + const float *k1_2 = kernel_base1 + 5; + + // load filter + float32x4_t k00_vec, k01_vec, k02_vec; + float32x4_t k10_vec, k11_vec, k12_vec; + + k00_vec = vld1q_f32(k0_0); + k01_vec = vld1q_f32(k0_1); + k02_vec = vld1q_f32(k0_2); + + k10_vec = vld1q_f32(k1_0); + k11_vec = vld1q_f32(k1_1); + k12_vec = vld1q_f32(k1_2); + + for (index_t i = 0; i < h; ++i) { + float *out_row_base0 = out_base0 + i * outw; + float *out_row0_0 = out_row_base0; + float *out_row0_1 = out_row_base0 + outw; + float *out_row0_2 = out_row_base0 + 2 * outw; + + float *out_row_base1 = out_base1 + i * outw; + float *out_row1_0 = out_row_base1; + float *out_row1_1 = out_row_base1 + outw; + float *out_row1_2 = out_row_base1 + 2 * outw; + + index_t j = 0; + + for (; j + 3 < w; j += 4) { + float32x4_t in_vec = vld1q_f32(in); + + float32x4_t out00, out01, out02; + float32x4_t out10, out11, out12; + float32x4_t out20, out21, out22; + + out00 = vld1q_f32(out_row0_0); + out00 = neon_vfma_lane_0(out00, in_vec, k00_vec); + vst1q_f32(out_row0_0, out00); + + out01 = vld1q_f32(out_row0_0 + 1); + out01 = neon_vfma_lane_1(out01, in_vec, k00_vec); + vst1q_f32(out_row0_0 + 1, out01); + + out02 = vld1q_f32(out_row0_0 + 2); + out02 = neon_vfma_lane_2(out02, in_vec, k00_vec); + vst1q_f32(out_row0_0 + 2, out02); + + out10 = vld1q_f32(out_row0_1 + 0); + 
out10 = neon_vfma_lane_0(out10, in_vec, k01_vec); + vst1q_f32(out_row0_1 + 0, out10); + + out11 = vld1q_f32(out_row0_1 + 1); + out11 = neon_vfma_lane_1(out11, in_vec, k01_vec); + vst1q_f32(out_row0_1 + 1, out11); + + out12 = vld1q_f32(out_row0_1 + 2); + out12 = neon_vfma_lane_2(out12, in_vec, k01_vec); + vst1q_f32(out_row0_1 + 2, out12); + + out20 = vld1q_f32(out_row0_2 + 0); + out20 = neon_vfma_lane_1(out20, in_vec, k02_vec); + vst1q_f32(out_row0_2 + 0, out20); + + out21 = vld1q_f32(out_row0_2 + 1); + out21 = neon_vfma_lane_2(out21, in_vec, k02_vec); + vst1q_f32(out_row0_2 + 1, out21); + + out22 = vld1q_f32(out_row0_2 + 2); + out22 = neon_vfma_lane_3(out22, in_vec, k02_vec); + vst1q_f32(out_row0_2 + 2, out22); + + out00 = vld1q_f32(out_row1_0 + 0); + out00 = neon_vfma_lane_0(out00, in_vec, k10_vec); + vst1q_f32(out_row1_0 + 0, out00); + + out01 = vld1q_f32(out_row1_0 + 1); + out01 = neon_vfma_lane_1(out01, in_vec, k10_vec); + vst1q_f32(out_row1_0 + 1, out01); + + out02 = vld1q_f32(out_row1_0 + 2); + out02 = neon_vfma_lane_2(out02, in_vec, k10_vec); + vst1q_f32(out_row1_0 + 2, out02); + + out10 = vld1q_f32(out_row1_1 + 0); + out10 = neon_vfma_lane_0(out10, in_vec, k11_vec); + vst1q_f32(out_row1_1 + 0, out10); + + out11 = vld1q_f32(out_row1_1 + 1); + out11 = neon_vfma_lane_1(out11, in_vec, k11_vec); + vst1q_f32(out_row1_1 + 1, out11); + + out12 = vld1q_f32(out_row1_1 + 2); + out12 = neon_vfma_lane_2(out12, in_vec, k11_vec); + vst1q_f32(out_row1_1 + 2, out12); + + out20 = vld1q_f32(out_row1_2 + 0); + out20 = neon_vfma_lane_1(out20, in_vec, k12_vec); + vst1q_f32(out_row1_2 + 0, out20); + + out21 = vld1q_f32(out_row1_2 + 1); + out21 = neon_vfma_lane_2(out21, in_vec, k12_vec); + vst1q_f32(out_row1_2 + 1, out21); + + out22 = vld1q_f32(out_row1_2 + 2); + out22 = neon_vfma_lane_3(out22, in_vec, k12_vec); + vst1q_f32(out_row1_2 + 2, out22); + + in += 4; + out_row0_0 += 4; + out_row0_1 += 4; + out_row0_2 += 4; + out_row1_0 += 4; + out_row1_1 += 4; + out_row1_2 += 4; + } + + for (; j < w; ++j) { + float val = in[0]; + for (int k = 0; k < 3; ++k) { + out_row0_0[k] += val * k0_0[k]; + out_row0_1[k] += val * k0_1[k]; + out_row0_2[k] += val * k0_2[k + 1]; + out_row1_0[k] += val * k1_0[k]; + out_row1_1[k] += val * k1_1[k]; + out_row1_2[k] += val * k1_2[k + 1]; + } + in++; + out_row0_0++; + out_row0_1++; + out_row0_2++; + out_row1_0++; + out_row1_1++; + out_row1_2++; + } + } + } + } else { + float *out_base0 = padded_out_data + (b * outch + oc) * outh * outw; + for (index_t ic = 0; ic < inch; ++ic) { + const float *input_base = input_data + (b * inch + ic) * h * w; + const float *kernel_base0 = filter_data + (oc * inch + ic) * 9; + const float *in = input_base; + const float *k0_0 = kernel_base0; + const float *k0_1 = kernel_base0 + 3; + const float *k0_2 = kernel_base0 + 5; + + // load filter + float32x4_t k00_vec = vld1q_f32(k0_0); + float32x4_t k01_vec = vld1q_f32(k0_1); + float32x4_t k02_vec = vld1q_f32(k0_2); + + for (index_t i = 0; i < h; ++i) { + float *out_row_base0 = out_base0 + i * outw; + float *out_row0_0 = out_row_base0; + float *out_row0_1 = out_row_base0 + outw; + float *out_row0_2 = out_row_base0 + 2 * outw; + index_t j = 0; + + for (; j + 3 < w; j += 4) { + float32x4_t in_vec = vld1q_f32(in); + + float32x4_t out00, out01, out02; + float32x4_t out10, out11, out12; + float32x4_t out20, out21, out22; + + out00 = vld1q_f32(out_row0_0 + 0); + out00 = neon_vfma_lane_0(out00, in_vec, k00_vec); + vst1q_f32(out_row0_0 + 0, out00); + + out01 = vld1q_f32(out_row0_0 + 1); + out01 = 
neon_vfma_lane_1(out01, in_vec, k00_vec); + vst1q_f32(out_row0_0 + 1, out01); + + out02 = vld1q_f32(out_row0_0 + 2); + out02 = neon_vfma_lane_2(out02, in_vec, k00_vec); + vst1q_f32(out_row0_0 + 2, out02); + + out10 = vld1q_f32(out_row0_1 + 0); + out10 = neon_vfma_lane_0(out10, in_vec, k01_vec); + vst1q_f32(out_row0_1 + 0, out10); + + out11 = vld1q_f32(out_row0_1 + 1); + out11 = neon_vfma_lane_1(out11, in_vec, k01_vec); + vst1q_f32(out_row0_1 + 1, out11); + + out12 = vld1q_f32(out_row0_1 + 2); + out12 = neon_vfma_lane_2(out12, in_vec, k01_vec); + vst1q_f32(out_row0_1 + 2, out12); + + out20 = vld1q_f32(out_row0_2 + 0); + out20 = neon_vfma_lane_1(out20, in_vec, k02_vec); + vst1q_f32(out_row0_2 + 0, out20); + + out21 = vld1q_f32(out_row0_2 + 1); + out21 = neon_vfma_lane_2(out21, in_vec, k02_vec); + vst1q_f32(out_row0_2 + 1, out21); + + out22 = vld1q_f32(out_row0_2 + 2); + out22 = neon_vfma_lane_3(out22, in_vec, k02_vec); + vst1q_f32(out_row0_2 + 2, out22); + + in += 4; + out_row0_0 += 4; + out_row0_1 += 4; + out_row0_2 += 4; + } + + for (; j < w; ++j) { + float val = in[0]; + for (int k = 0; k < 3; ++k) { + out_row0_0[k] += val * k0_0[k]; + out_row0_1[k] += val * k0_1[k]; + out_row0_2[k] += val * k0_2[k + 1]; + } + in++; + out_row0_0++; + out_row0_1++; + out_row0_2++; + } + } + } + } + } + } + }, 0, batch, 1, 0, outch, 2); + + UnPadOutput(*out_tensor, out_pad_size, output); + + return MaceStatus::MACE_SUCCESS; +} + +MaceStatus Deconv2dK3x3S2::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) { + std::unique_ptr padded_out; + std::vector out_pad_size; + ResizeOutAndPadOut(context, + input, + filter, + output_shape, + output, + &out_pad_size, + &padded_out); + + Tensor *out_tensor = output; + if (padded_out != nullptr) { + out_tensor = padded_out.get(); + } + + out_tensor->Clear(); + + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard filter_mapper(filter); + Tensor::MappingGuard output_mapper(output); + + auto input_data = input->data(); + auto filter_data = filter->data(); + auto padded_out_data = out_tensor->mutable_data(); + + auto &in_shape = input->shape(); + auto &out_shape = out_tensor->shape(); + + const index_t batch = in_shape[0]; + const index_t inch = in_shape[1]; + const index_t h = in_shape[2]; + const index_t w = in_shape[3]; + + const index_t outch = out_shape[1]; + const index_t outh = out_shape[2]; + const index_t outw = out_shape[3]; + const index_t out_img_size = outh * outw; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t oc = start1; oc < end1; oc += step1) { + float *out_base = padded_out_data + (b * outch + oc) * out_img_size; + for (index_t ic = 0; ic < inch; ++ic) { + const float *input_base = input_data + (b * inch + ic) * h * w; + const float *kernel_base = filter_data + (oc * inch + ic) * 9; + const float *in = input_base; + + const float *k0 = kernel_base; + const float *k1 = kernel_base + 3; + const float *k2 = kernel_base + 5; + + float32x4_t k0_vec = vld1q_f32(k0); + float32x4_t k1_vec = vld1q_f32(k1); + float32x4_t k2_vec = vld1q_f32(k2); + + for (index_t i = 0; i < h; ++i) { + float *out_row_base = out_base + i * 2 * outw; + float *out_row_0 = out_row_base; + float *out_row_1 = out_row_0 + outw; + float *out_row_2 = out_row_1 + outw; + + 
index_t j = 0; + + for (index_t n = 0; n + 9 < outw; n += 8) { + float32x4_t in_vec = vld1q_f32(in); + + // out row 0 + float32x4x2_t out00 = vld2q_f32(out_row_0); + out00.val[0] = + neon_vfma_lane_0(out00.val[0], in_vec, k0_vec); + out00.val[1] = + neon_vfma_lane_1(out00.val[1], in_vec, k0_vec); + vst2q_f32(out_row_0, out00); + + float32x4x2_t out01 = vld2q_f32(out_row_0 + 2); + out01.val[0] = + neon_vfma_lane_2(out01.val[0], in_vec, k0_vec); + vst2q_f32(out_row_0 + 2, out01); + + // out row 1 + float32x4x2_t out10 = vld2q_f32(out_row_1); + out10.val[0] = + neon_vfma_lane_0(out10.val[0], in_vec, k1_vec); + out10.val[1] = + neon_vfma_lane_1(out10.val[1], in_vec, k1_vec); + vst2q_f32(out_row_1, out10); + + float32x4x2_t out11 = vld2q_f32(out_row_1 + 2); + out11.val[0] = + neon_vfma_lane_2(out11.val[0], in_vec, k1_vec); + vst2q_f32(out_row_1 + 2, out11); + + // out row 2 + float32x4x2_t out20 = vld2q_f32(out_row_2); + out20.val[0] = + neon_vfma_lane_1(out20.val[0], in_vec, k2_vec); + out20.val[1] = + neon_vfma_lane_2(out20.val[1], in_vec, k2_vec); + vst2q_f32(out_row_2, out20); + + float32x4x2_t out21 = vld2q_f32(out_row_2 + 2); + out21.val[0] = + neon_vfma_lane_3(out21.val[0], in_vec, k2_vec); + vst2q_f32(out_row_2 + 2, out21); + + in += 4; + out_row_0 += 8; + out_row_1 += 8; + out_row_2 += 8; + j += 4; + } + + for (; j < w; ++j) { + float val = in[0]; + + for (int k = 0; k < 3; ++k) { + out_row_0[k] += val * k0[k]; + out_row_1[k] += val * k1[k]; + out_row_2[k] += val * k2[k + 1]; + } + + in++; + out_row_0 += 2; + out_row_1 += 2; + out_row_2 += 2; + } + } + } + } + } + }, 0, batch, 1, 0, outch, 1); + + UnPadOutput(*out_tensor, out_pad_size, output); + + return MaceStatus::MACE_SUCCESS; +} + +} // namespace fp32 +} // namespace arm +} // namespace ops +} // namespace mace diff --git a/mace/ops/arm/fp32/deconv_2d_3x3.h b/mace/ops/arm/fp32/deconv_2d_3x3.h new file mode 100644 index 0000000000000000000000000000000000000000..4495cbe8e4ef5fa3b05c72e9970fa05fb67a7fbb --- /dev/null +++ b/mace/ops/arm/fp32/deconv_2d_3x3.h @@ -0,0 +1,70 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
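The 3x3 kernels above load the three filter rows at offsets 0, 3, and 5 rather than 0, 3, 6, so every vld1q_f32 stays inside the 9-float filter; that is why the third row is accumulated with lane indices 1-3 and why the scalar tails read k*_2[k + 1]. A minimal standalone sketch of that lane mapping (the helper name is illustrative, not taken from the patch):

#include <arm_neon.h>

// Loads a row-major 3x3 filter (9 floats) into three 4-lane registers without
// reading past the end of the array.
static inline void LoadFilter3x3(const float *filter,
                                 float32x4_t *row0,
                                 float32x4_t *row1,
                                 float32x4_t *row2) {
  *row0 = vld1q_f32(filter);      // lanes 0..2 = filter[0..2]
  *row1 = vld1q_f32(filter + 3);  // lanes 0..2 = filter[3..5]
  // A load at +6 would touch filter[9], one float past the buffer, so the
  // kernels load at +5 instead: lanes 1..3 = filter[6..8].
  *row2 = vld1q_f32(filter + 5);
}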
+
+#ifndef MACE_OPS_ARM_FP32_DECONV_2D_3X3_H_
+#define MACE_OPS_ARM_FP32_DECONV_2D_3X3_H_
+
+#include <memory>
+#include <vector>
+
+#include "mace/public/mace.h"
+#include "mace/core/tensor.h"
+#include "mace/core/types.h"
+#include "mace/core/op_context.h"
+#include "mace/ops/arm/fp32/deconv_2d.h"
+#include "mace/ops/common/conv_pool_2d_util.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+namespace fp32 {
+
+class Deconv2dK3x3S1 : public Deconv2dBase {
+ public:
+  Deconv2dK3x3S1(const std::vector<int> &paddings,
+                 const Padding padding_type,
+                 const FrameworkType framework_type)
+      : Deconv2dBase({1, 1}, {1, 1}, paddings, padding_type, framework_type) {}
+  virtual ~Deconv2dK3x3S1() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      const Tensor *output_shape,
+      Tensor *output) override;
+};
+
+class Deconv2dK3x3S2 : public Deconv2dBase {
+ public:
+  Deconv2dK3x3S2(const std::vector<int> &paddings,
+                 const Padding padding_type,
+                 const FrameworkType framework_type)
+      : Deconv2dBase({2, 2}, {1, 1}, paddings, padding_type, framework_type) {}
+  virtual ~Deconv2dK3x3S2() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      const Tensor *output_shape,
+      Tensor *output) override;
+};
+
+}  // namespace fp32
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_ARM_FP32_DECONV_2D_3X3_H_
diff --git a/mace/ops/arm/fp32/deconv_2d_4x4.cc b/mace/ops/arm/fp32/deconv_2d_4x4.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3c47ecff71bc46ea02aa73cb49d511a22c61ba27
--- /dev/null
+++ b/mace/ops/arm/fp32/deconv_2d_4x4.cc
@@ -0,0 +1,581 @@
+// Copyright 2019 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
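These kernels lean on the neon_vfma_lane_0..3 helpers from mace/ops/arm/fp32/common_neon.h, which is not shown in this patch. Judging from the equivalent open-coded blocks further down in depthwise_conv_2d_3x3.cc, they presumably wrap vfmaq_laneq_f32 on AArch64 and vmlaq_lane_f32 on 32-bit ARM; a sketch of what lane 0 would look like under that assumption:

#include <arm_neon.h>

// acc += a * b[0]; fused multiply-add on AArch64, multiply-accumulate on ARMv7.
static inline float32x4_t neon_vfma_lane_0(float32x4_t acc,
                                            float32x4_t a,
                                            float32x4_t b) {
#if defined(__aarch64__)
  return vfmaq_laneq_f32(acc, a, b, 0);
#else
  return vmlaq_lane_f32(acc, a, vget_low_f32(b), 0);
#endif
}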
+ +#include "mace/ops/arm/fp32/deconv_2d_4x4.h" + +#include +#include "mace/ops/arm/fp32/common_neon.h" + +namespace mace { +namespace ops { +namespace arm { +namespace fp32 { + +MaceStatus Deconv2dK4x4S1::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) { + std::unique_ptr padded_out; + std::vector out_pad_size; + ResizeOutAndPadOut(context, + input, + filter, + output_shape, + output, + &out_pad_size, + &padded_out); + Tensor *out_tensor = output; + if (padded_out != nullptr) { + out_tensor = padded_out.get(); + } + out_tensor->Clear(); + + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard filter_mapper(filter); + Tensor::MappingGuard output_mapper(output); + + auto input_data = input->data(); + auto filter_data = filter->data(); + auto padded_out_data = out_tensor->mutable_data(); + + auto &in_shape = input->shape(); + auto &out_shape = out_tensor->shape(); + + const index_t batch = in_shape[0]; + const index_t inch = in_shape[1]; + const index_t h = in_shape[2]; + const index_t w = in_shape[3]; + + const index_t outch = out_shape[1]; + const index_t outh = out_shape[2]; + const index_t outw = out_shape[3]; + + const index_t out_img_size = outh * outw; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t oc = start1; oc < end1; oc += step1) { + if (oc + 1 < outch) { + float *out_base = padded_out_data + (b * outch + oc) * out_img_size; + float *out_base1 = out_base + out_img_size; + for (index_t q = 0; q < inch; q++) { + const float *input_base = input_data + (b * inch + q) * h * w; + const float *in = input_base; + const float *kernel_base = filter_data + (oc * inch + q) * 16; + const float *k0 = kernel_base; + const float *k1 = kernel_base + 4; + const float *k2 = kernel_base + 8; + const float *k3 = kernel_base + 12; + + const float *kernel_base1 = kernel_base + inch * 16; + const float *k10 = kernel_base1; + const float *k11 = kernel_base1 + 4; + const float *k12 = kernel_base1 + 8; + const float *k13 = kernel_base1 + 12; + + float32x4_t k0_vec = vld1q_f32(k0); + float32x4_t k1_vec = vld1q_f32(k1); + float32x4_t k2_vec = vld1q_f32(k2); + float32x4_t k3_vec = vld1q_f32(k3); + + float32x4_t k10_vec = vld1q_f32(k10); + float32x4_t k11_vec = vld1q_f32(k11); + float32x4_t k12_vec = vld1q_f32(k12); + float32x4_t k13_vec = vld1q_f32(k13); + + for (index_t i = 0; i < h; i++) { + float *out_row = out_base + i * outw; + + float *out_row_0 = out_row; + float *out_row_1 = out_row_0 + outw; + float *out_row_2 = out_row_1 + outw; + float *out_row_3 = out_row_2 + outw; + + float *out_row1 = out_base1 + i * outw; + + float *out_row1_0 = out_row1; + float *out_row1_1 = out_row1_0 + outw; + float *out_row1_2 = out_row1_1 + outw; + float *out_row1_3 = out_row1_2 + outw; + + index_t j = 0; + + for (; j + 3 < w; j += 4) { + float32x4_t in_vec = vld1q_f32(in); + float32x4_t out00, out01, out02, out03; + float32x4_t out10, out11, out12, out13; + + out00 = vld1q_f32(out_row_0); + out00 = neon_vfma_lane_0(out00, in_vec, k0_vec); + vst1q_f32(out_row_0, out00); + + out10 = vld1q_f32(out_row1_0); + out10 = neon_vfma_lane_0(out10, in_vec, k10_vec); + vst1q_f32(out_row1_0, out10); + + out01 = vld1q_f32(out_row_0 + 1); + out01 = neon_vfma_lane_1(out01, in_vec, k0_vec); + vst1q_f32(out_row_0 + 1, 
out01); + + out11 = vld1q_f32(out_row1_0 + 1); + out11 = neon_vfma_lane_1(out11, in_vec, k10_vec); + vst1q_f32(out_row1_0 + 1, out11); + + out02 = vld1q_f32(out_row_0 + 2); + out02 = neon_vfma_lane_2(out02, in_vec, k0_vec); + vst1q_f32(out_row_0 + 2, out02); + + out12 = vld1q_f32(out_row1_0 + 2); + out12 = neon_vfma_lane_2(out12, in_vec, k10_vec); + vst1q_f32(out_row1_0 + 2, out12); + + out03 = vld1q_f32(out_row_0 + 3); + out03 = neon_vfma_lane_3(out03, in_vec, k0_vec); + vst1q_f32(out_row_0 + 3, out03); + + out13 = vld1q_f32(out_row1_0 + 3); + out13 = neon_vfma_lane_3(out13, in_vec, k10_vec); + vst1q_f32(out_row1_0 + 3, out13); + + out00 = vld1q_f32(out_row_1); + out00 = neon_vfma_lane_0(out00, in_vec, k1_vec); + vst1q_f32(out_row_1, out00); + + out10 = vld1q_f32(out_row1_1); + out10 = neon_vfma_lane_0(out10, in_vec, k11_vec); + vst1q_f32(out_row1_1, out10); + + out01 = vld1q_f32(out_row_1 + 1); + out01 = neon_vfma_lane_1(out01, in_vec, k1_vec); + vst1q_f32(out_row_1 + 1, out01); + + out11 = vld1q_f32(out_row1_1 + 1); + out11 = neon_vfma_lane_1(out11, in_vec, k11_vec); + vst1q_f32(out_row1_1 + 1, out11); + + out02 = vld1q_f32(out_row_1 + 2); + out02 = neon_vfma_lane_2(out02, in_vec, k1_vec); + vst1q_f32(out_row_1 + 2, out02); + + out12 = vld1q_f32(out_row1_1 + 2); + out12 = neon_vfma_lane_2(out12, in_vec, k11_vec); + vst1q_f32(out_row1_1 + 2, out12); + + out03 = vld1q_f32(out_row_1 + 3); + out03 = neon_vfma_lane_3(out03, in_vec, k1_vec); + vst1q_f32(out_row_1 + 3, out03); + + out13 = vld1q_f32(out_row1_1 + 3); + out13 = neon_vfma_lane_3(out13, in_vec, k11_vec); + vst1q_f32(out_row1_1 + 3, out13); + + out00 = vld1q_f32(out_row_2 + 0); + out00 = neon_vfma_lane_0(out00, in_vec, k2_vec); + vst1q_f32(out_row_2 + 0, out00); + + out10 = vld1q_f32(out_row1_2 + 0); + out10 = neon_vfma_lane_0(out10, in_vec, k12_vec); + vst1q_f32(out_row1_2 + 0, out10); + + out01 = vld1q_f32(out_row_2 + 1); + out01 = neon_vfma_lane_1(out01, in_vec, k2_vec); + vst1q_f32(out_row_2 + 1, out01); + + out11 = vld1q_f32(out_row1_2 + 1); + out11 = neon_vfma_lane_1(out11, in_vec, k12_vec); + vst1q_f32(out_row1_2 + 1, out11); + + out02 = vld1q_f32(out_row_2 + 2); + out02 = neon_vfma_lane_2(out02, in_vec, k2_vec); + vst1q_f32(out_row_2 + 2, out02); + + out12 = vld1q_f32(out_row1_2 + 2); + out12 = neon_vfma_lane_2(out12, in_vec, k12_vec); + vst1q_f32(out_row1_2 + 2, out12); + + out03 = vld1q_f32(out_row_2 + 3); + out03 = neon_vfma_lane_3(out03, in_vec, k2_vec); + vst1q_f32(out_row_2 + 3, out03); + + out13 = vld1q_f32(out_row1_2 + 3); + out13 = neon_vfma_lane_3(out13, in_vec, k12_vec); + vst1q_f32(out_row1_2 + 3, out13); + + out00 = vld1q_f32(out_row_3 + 0); + out00 = neon_vfma_lane_0(out00, in_vec, k3_vec); + vst1q_f32(out_row_3 + 0, out00); + + out10 = vld1q_f32(out_row1_3 + 0); + out10 = neon_vfma_lane_0(out10, in_vec, k13_vec); + vst1q_f32(out_row1_3 + 0, out10); + + out01 = vld1q_f32(out_row_3 + 1); + out01 = neon_vfma_lane_1(out01, in_vec, k3_vec); + vst1q_f32(out_row_3 + 1, out01); + + out11 = vld1q_f32(out_row1_3 + 1); + out11 = neon_vfma_lane_1(out11, in_vec, k13_vec); + vst1q_f32(out_row1_3 + 1, out11); + + out02 = vld1q_f32(out_row_3 + 2); + out02 = neon_vfma_lane_2(out02, in_vec, k3_vec); + vst1q_f32(out_row_3 + 2, out02); + + out12 = vld1q_f32(out_row1_3 + 2); + out12 = neon_vfma_lane_2(out12, in_vec, k13_vec); + vst1q_f32(out_row1_3 + 2, out12); + + out03 = vld1q_f32(out_row_3 + 3); + out03 = neon_vfma_lane_3(out03, in_vec, k3_vec); + vst1q_f32(out_row_3 + 3, out03); + + out13 = vld1q_f32(out_row1_3 + 3); + out13 = 
neon_vfma_lane_3(out13, in_vec, k13_vec); + vst1q_f32(out_row1_3 + 3, out13); + + in += 4; + out_row_0 += 4; + out_row_1 += 4; + out_row_2 += 4; + out_row_3 += 4; + out_row1_0 += 4; + out_row1_1 += 4; + out_row1_2 += 4; + out_row1_3 += 4; + } + + for (; j < w; j++) { + float val = in[0]; + for (int k = 0; k < 4; ++k) { + out_row_0[k] += val * k0[k]; + out_row_1[k] += val * k1[k]; + out_row_2[k] += val * k2[k]; + out_row_3[k] += val * k3[k]; + out_row1_0[k] += val * k10[k]; + out_row1_1[k] += val * k11[k]; + out_row1_2[k] += val * k12[k]; + out_row1_3[k] += val * k13[k]; + } + in++; + out_row_0++; + out_row_1++; + out_row_2++; + out_row_3++; + out_row1_0++; + out_row1_1++; + out_row1_2++; + out_row1_3++; + } + } + } + } else { + float *out_base = padded_out_data + (b * outch + oc) * out_img_size; + for (index_t q = 0; q < inch; q++) { + const float *input_base = input_data + (b * inch + q) * h * w; + const float *kernel_base = filter_data + (oc * inch + q) * 16; + const float *in = input_base; + const float *k0 = kernel_base; + const float *k1 = kernel_base + 4; + const float *k2 = kernel_base + 8; + const float *k3 = kernel_base + 12; + + float32x4_t k0_vec = vld1q_f32(k0); + float32x4_t k1_vec = vld1q_f32(k1); + float32x4_t k2_vec = vld1q_f32(k2); + float32x4_t k3_vec = vld1q_f32(k3); + + for (index_t i = 0; i < h; i++) { + float *out_row = out_base + i * outw; + float *out_row_0 = out_row; + float *out_row_1 = out_row_0 + outw; + float *out_row_2 = out_row_1 + outw; + float *out_row_3 = out_row_2 + outw; + int j = 0; + + for (; j + 3 < w; j += 4) { + float32x4_t in_vec = vld1q_f32(in); + + float32x4_t out00 = vld1q_f32(out_row_0); + out00 = neon_vfma_lane_0(out00, in_vec, k0_vec); + vst1q_f32(out_row_0, out00); + + float32x4_t out01 = vld1q_f32(out_row_0 + 1); + out01 = neon_vfma_lane_1(out01, in_vec, k0_vec); + vst1q_f32(out_row_0 + 1, out01); + + float32x4_t out02 = vld1q_f32(out_row_0 + 2); + out02 = neon_vfma_lane_2(out02, in_vec, k0_vec); + vst1q_f32(out_row_0 + 2, out02); + + float32x4_t out03 = vld1q_f32(out_row_0 + 3); + out03 = neon_vfma_lane_3(out03, in_vec, k0_vec); + vst1q_f32(out_row_0 + 3, out03); + + float32x4_t out10 = vld1q_f32(out_row_1); + out10 = neon_vfma_lane_0(out10, in_vec, k1_vec); + vst1q_f32(out_row_1, out10); + + float32x4_t out11 = vld1q_f32(out_row_1 + 1); + out11 = neon_vfma_lane_1(out11, in_vec, k1_vec); + vst1q_f32(out_row_1 + 1, out11); + + float32x4_t out12 = vld1q_f32(out_row_1 + 2); + out12 = neon_vfma_lane_2(out12, in_vec, k1_vec); + vst1q_f32(out_row_1 + 2, out12); + + float32x4_t out13 = vld1q_f32(out_row_1 + 3); + out13 = neon_vfma_lane_3(out13, in_vec, k1_vec); + vst1q_f32(out_row_1 + 3, out13); + + float32x4_t out20 = vld1q_f32(out_row_2 + 0); + out20 = neon_vfma_lane_0(out20, in_vec, k2_vec); + vst1q_f32(out_row_2 + 0, out20); + + float32x4_t out21 = vld1q_f32(out_row_2 + 1); + out21 = neon_vfma_lane_1(out21, in_vec, k2_vec); + vst1q_f32(out_row_2 + 1, out21); + + float32x4_t out22 = vld1q_f32(out_row_2 + 2); + out22 = neon_vfma_lane_2(out22, in_vec, k2_vec); + vst1q_f32(out_row_2 + 2, out22); + + float32x4_t out23 = vld1q_f32(out_row_2 + 3); + out23 = neon_vfma_lane_3(out23, in_vec, k2_vec); + vst1q_f32(out_row_2 + 3, out23); + + float32x4_t out30 = vld1q_f32(out_row_3 + 0); + out30 = neon_vfma_lane_0(out30, in_vec, k3_vec); + vst1q_f32(out_row_3 + 0, out30); + + float32x4_t out31 = vld1q_f32(out_row_3 + 1); + out31 = neon_vfma_lane_1(out31, in_vec, k3_vec); + vst1q_f32(out_row_3 + 1, out31); + + float32x4_t out32 = vld1q_f32(out_row_3 + 2); + 
out32 = neon_vfma_lane_2(out32, in_vec, k3_vec); + vst1q_f32(out_row_3 + 2, out32); + + float32x4_t out33 = vld1q_f32(out_row_3 + 3); + out33 = neon_vfma_lane_3(out33, in_vec, k3_vec); + vst1q_f32(out_row_3 + 3, out33); + + in += 4; + out_row_0 += 4; + out_row_1 += 4; + out_row_2 += 4; + out_row_3 += 4; + } + + for (; j < w; j++) { + float val = in[0]; + for (int k = 0; k < 4; ++k) { + out_row_0[k] += val * k0[k]; + out_row_1[k] += val * k1[k]; + out_row_2[k] += val * k2[k]; + out_row_3[k] += val * k3[k]; + } + in++; + out_row_0++; + out_row_1++; + out_row_2++; + out_row_3++; + } + } + } + } + } + } + }, 0, batch, 1, 0, outch, 2); + + UnPadOutput(*out_tensor, out_pad_size, output); + + return MaceStatus::MACE_SUCCESS; +} + +MaceStatus Deconv2dK4x4S2::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) { + std::unique_ptr padded_out; + std::vector out_pad_size; + ResizeOutAndPadOut(context, + input, + filter, + output_shape, + output, + &out_pad_size, + &padded_out); + Tensor *out_tensor = output; + if (padded_out != nullptr) { + out_tensor = padded_out.get(); + } + out_tensor->Clear(); + + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard filter_mapper(filter); + Tensor::MappingGuard output_mapper(output); + + auto input_data = input->data(); + auto filter_data = filter->data(); + auto padded_out_data = out_tensor->mutable_data(); + + auto &in_shape = input->shape(); + auto &out_shape = out_tensor->shape(); + + const index_t batch = in_shape[0]; + const index_t inch = in_shape[1]; + const index_t h = in_shape[2]; + const index_t w = in_shape[3]; + + const index_t outch = out_shape[1]; + const index_t outh = out_shape[2]; + const index_t outw = out_shape[3]; + const index_t out_img_size = outh * outw; + + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t p = start1; p < end1; p += step1) { + float *out_base = padded_out_data + (b * outch + p) * out_img_size; + for (index_t q = 0; q < inch; q++) { + const float *input_base = input_data + (b * inch + q) * h * w; + const float *kernel_base = filter_data + (p * inch + q) * 16; + const float *in = input_base; + + const float *k0 = kernel_base; + const float *k1 = kernel_base + 4; + const float *k2 = kernel_base + 8; + const float *k3 = kernel_base + 12; + + float32x4_t k0_vec = vld1q_f32(k0); + float32x4_t k1_vec = vld1q_f32(k1); + float32x4_t k2_vec = vld1q_f32(k2); + float32x4_t k3_vec = vld1q_f32(k3); + + for (index_t i = 0; i < h; i++) { + float *out_row = out_base + 2 * i * outw; + + float *out_row_0 = out_row; + float *out_row_1 = out_row_0 + outw; + float *out_row_2 = out_row_1 + outw; + float *out_row_3 = out_row_2 + outw; + + index_t j = 0; + + for (index_t n = 0; n + 9 < outw; n += 8) { + float32x4_t in_vec = vld1q_f32(in); + + // row 0 + float32x4x2_t out0 = vld2q_f32(out_row_0); + out0.val[0] = + neon_vfma_lane_0(out0.val[0], in_vec, k0_vec); + out0.val[1] = + neon_vfma_lane_1(out0.val[1], in_vec, k0_vec); + vst2q_f32(out_row_0, out0); + out0 = vld2q_f32(out_row_0 + 2); + out0.val[0] = + neon_vfma_lane_2(out0.val[0], in_vec, k0_vec); + out0.val[1] = + neon_vfma_lane_3(out0.val[1], in_vec, k0_vec); + vst2q_f32(out_row_0 + 2, out0); + + // row 1 + float32x4x2_t out1 = vld2q_f32(out_row_1); + out1.val[0] = + 
neon_vfma_lane_0(out1.val[0], in_vec, k1_vec); + out1.val[1] = + neon_vfma_lane_1(out1.val[1], in_vec, k1_vec); + vst2q_f32(out_row_1, out1); + out1 = vld2q_f32(out_row_1 + 2); + out1.val[0] = + neon_vfma_lane_2(out1.val[0], in_vec, k1_vec); + out1.val[1] = + neon_vfma_lane_3(out1.val[1], in_vec, k1_vec); + vst2q_f32(out_row_1 + 2, out1); + + // row 2 + float32x4x2_t out2 = vld2q_f32(out_row_2); + out2.val[0] = + neon_vfma_lane_0(out2.val[0], in_vec, k2_vec); + out2.val[1] = + neon_vfma_lane_1(out2.val[1], in_vec, k2_vec); + vst2q_f32(out_row_2, out2); + out2 = vld2q_f32(out_row_2 + 2); + out2.val[0] = + neon_vfma_lane_2(out2.val[0], in_vec, k2_vec); + out2.val[1] = + neon_vfma_lane_3(out2.val[1], in_vec, k2_vec); + vst2q_f32(out_row_2 + 2, out2); + + // row 3 + float32x4x2_t out3 = vld2q_f32(out_row_3); + out3.val[0] = + neon_vfma_lane_0(out3.val[0], in_vec, k3_vec); + out3.val[1] = + neon_vfma_lane_1(out3.val[1], in_vec, k3_vec); + vst2q_f32(out_row_3, out3); + out3 = vld2q_f32(out_row_3 + 2); + out3.val[0] = + neon_vfma_lane_2(out3.val[0], in_vec, k3_vec); + out3.val[1] = + neon_vfma_lane_3(out3.val[1], in_vec, k3_vec); + vst2q_f32(out_row_3 + 2, out3); + + in += 4; + out_row_0 += 8; + out_row_1 += 8; + out_row_2 += 8; + out_row_3 += 8; + j += 4; + } + + for (; j < w; j++) { + float val = in[0]; + for (int k = 0; k < 4; ++k) { + out_row_0[k] += val * k0[k]; + out_row_1[k] += val * k1[k]; + out_row_2[k] += val * k2[k]; + out_row_3[k] += val * k3[k]; + } + in++; + out_row_0 += 2; + out_row_1 += 2; + out_row_2 += 2; + out_row_3 += 2; + } + } + } + } + } + }, 0, batch, 1, 0, outch, 1); + + UnPadOutput(*out_tensor, out_pad_size, output); + + return MaceStatus::MACE_SUCCESS; +} + +} // namespace fp32 +} // namespace arm +} // namespace ops +} // namespace mace diff --git a/mace/ops/arm/fp32/deconv_2d_4x4.h b/mace/ops/arm/fp32/deconv_2d_4x4.h new file mode 100644 index 0000000000000000000000000000000000000000..9f09056af0224331fca8815cca18a1f7eecdd1cc --- /dev/null +++ b/mace/ops/arm/fp32/deconv_2d_4x4.h @@ -0,0 +1,70 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
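The stride-2 variants (Deconv2dK3x3S2 and Deconv2dK4x4S2) scatter each input pixel into every other output column, so they access the output through vld2q_f32/vst2q_f32, which de-interleave eight consecutive floats into an even-index and an odd-index vector. A small self-contained sketch of that accumulate pattern (the function name is illustrative, not from the patch):

#include <arm_neon.h>

// Adds one contribution to out[0,2,4,6] and another to out[1,3,5,7] with a
// single de-interleaved load/store pair; out must point at 8 writable floats.
static inline void AccumulateEvenOdd(float *out,
                                     float32x4_t even_contrib,
                                     float32x4_t odd_contrib) {
  float32x4x2_t v = vld2q_f32(out);   // val[0]: even columns, val[1]: odd columns
  v.val[0] = vaddq_f32(v.val[0], even_contrib);
  v.val[1] = vaddq_f32(v.val[1], odd_contrib);
  vst2q_f32(out, v);                  // re-interleave on store
}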
+
+#ifndef MACE_OPS_ARM_FP32_DECONV_2D_4X4_H_
+#define MACE_OPS_ARM_FP32_DECONV_2D_4X4_H_
+
+#include <memory>
+#include <vector>
+
+#include "mace/public/mace.h"
+#include "mace/core/tensor.h"
+#include "mace/core/types.h"
+#include "mace/core/op_context.h"
+#include "mace/ops/arm/fp32/deconv_2d.h"
+#include "mace/ops/common/conv_pool_2d_util.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+namespace fp32 {
+
+class Deconv2dK4x4S1 : public Deconv2dBase {
+ public:
+  Deconv2dK4x4S1(const std::vector<int> &paddings,
+                 const Padding padding_type,
+                 const FrameworkType framework_type)
+      : Deconv2dBase({1, 1}, {1, 1}, paddings, padding_type, framework_type) {}
+  virtual ~Deconv2dK4x4S1() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      const Tensor *output_shape,
+      Tensor *output) override;
+};
+
+class Deconv2dK4x4S2 : public Deconv2dBase {
+ public:
+  Deconv2dK4x4S2(const std::vector<int> &paddings,
+                 const Padding padding_type,
+                 const FrameworkType framework_type)
+      : Deconv2dBase({2, 2}, {1, 1}, paddings, padding_type, framework_type) {}
+  virtual ~Deconv2dK4x4S2() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      const Tensor *output_shape,
+      Tensor *output) override;
+};
+
+}  // namespace fp32
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_ARM_FP32_DECONV_2D_4X4_H_
diff --git a/mace/ops/arm/fp32/deconv_2d_general.cc b/mace/ops/arm/fp32/deconv_2d_general.cc
new file mode 100644
index 0000000000000000000000000000000000000000..47bfe39cf27adac58b1240afa66390fc23dc8866
--- /dev/null
+++ b/mace/ops/arm/fp32/deconv_2d_general.cc
@@ -0,0 +1,117 @@
+// Copyright 2019 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
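Every Compute() in these files first accumulates into a scratch output that ResizeOutAndPadOut may have padded, then calls UnPadOutput to copy the valid window into the caller's tensor. Neither helper appears in this patch; the sketch below only illustrates the cropping idea for NCHW data and is not the real UnPadOutput implementation:

#include <cstddef>

// Copies the inner out_h x out_w window out of each padded_h x padded_w image
// plane, discarding pad_top rows and pad_left columns of padding
// (planes = batch * channels). Sketch only.
void CropPaddedOutput(const float *padded, float *out,
                      std::size_t planes,
                      std::size_t padded_h, std::size_t padded_w,
                      std::size_t out_h, std::size_t out_w,
                      std::size_t pad_top, std::size_t pad_left) {
  for (std::size_t p = 0; p < planes; ++p) {
    const float *src = padded + p * padded_h * padded_w;
    float *dst = out + p * out_h * out_w;
    for (std::size_t i = 0; i < out_h; ++i) {
      const float *src_row = src + (i + pad_top) * padded_w + pad_left;
      for (std::size_t j = 0; j < out_w; ++j) {
        dst[i * out_w + j] = src_row[j];
      }
    }
  }
}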
+ +#include "mace/ops/arm/fp32/deconv_2d_general.h" + +// TODO(liutuo): optimize it + +namespace mace { +namespace ops { +namespace arm { +namespace fp32 { + +MaceStatus Deconv2dGeneral::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) { + std::unique_ptr padded_out; + std::vector out_pad_size; + ResizeOutAndPadOut(context, + input, + filter, + output_shape, + output, + &out_pad_size, + &padded_out); + Tensor *out_tensor = output; + if (padded_out != nullptr) { + out_tensor = padded_out.get(); + } + + out_tensor->Clear(); + + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard filter_mapper(filter); + Tensor::MappingGuard output_mapper(output); + + auto input_data = input->data(); + auto filter_data = filter->data(); + auto padded_out_data = out_tensor->mutable_data(); + + auto &in_shape = input->shape(); + auto &out_shape = out_tensor->shape(); + + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + const index_t out_img_size = out_height * out_width; + const index_t in_img_size = in_height * in_width; + const index_t kernel_h = filter->dim(2); + const index_t kernel_w = filter->dim(3); + + const int kernel_size = static_cast(kernel_h * kernel_w); + std::vector index_map(kernel_size, 0); + for (index_t i = 0; i < kernel_h; ++i) { + for (index_t j = 0; j < kernel_w; ++j) { + index_map[i * kernel_w + j] = i * out_width + j; + } + } + + const index_t batch = in_shape[0]; + const index_t out_channels = out_shape[1]; + const index_t in_channels = in_shape[1]; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t oc = start1; oc < end1; oc += step1) { + float *out_base = + padded_out_data + (b * out_channels + oc) * out_img_size; + for (index_t i = 0; i < in_height; ++i) { + for (index_t j = 0; j < in_width; ++j) { + const index_t out_offset = + i * strides_[0] * out_width + j * strides_[1]; + for (int ic = 0; ic < in_channels; ++ic) { + const index_t input_idx = + (b * in_channels + ic) * in_img_size + i * in_width + j; + const float val = input_data[input_idx]; + const index_t kernel_offset = + (oc * in_channels + ic) * kernel_size; + for (int k = 0; k < kernel_size; ++k) { + const index_t out_idx = out_offset + index_map[k]; + const index_t kernel_idx = kernel_offset + k; + out_base[out_idx] += val * filter_data[kernel_idx]; + } + } + } + } + } + } + }, 0, batch, 1, 0, out_channels, 1); + + UnPadOutput(*out_tensor, out_pad_size, output); + + return MaceStatus::MACE_SUCCESS; +} + +} // namespace fp32 +} // namespace arm +} // namespace ops +} // namespace mace + diff --git a/mace/ops/arm/fp32/deconv_2d_general.h b/mace/ops/arm/fp32/deconv_2d_general.h new file mode 100644 index 0000000000000000000000000000000000000000..d11ada030c02c4f155aec12e0a162513cdae0c25 --- /dev/null +++ b/mace/ops/arm/fp32/deconv_2d_general.h @@ -0,0 +1,60 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_OPS_ARM_FP32_DECONV_2D_GENERAL_H_
+#define MACE_OPS_ARM_FP32_DECONV_2D_GENERAL_H_
+
+#include <memory>
+#include <vector>
+
+#include "mace/public/mace.h"
+#include "mace/core/tensor.h"
+#include "mace/core/types.h"
+#include "mace/core/op_context.h"
+#include "mace/ops/arm/fp32/deconv_2d.h"
+#include "mace/ops/common/conv_pool_2d_util.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+namespace fp32 {
+
+class Deconv2dGeneral : public Deconv2dBase {
+ public:
+  Deconv2dGeneral(const std::vector<int> &strides,
+                  const std::vector<int> &dilations,
+                  const std::vector<int> &paddings,
+                  const Padding padding_type,
+                  const FrameworkType framework_type)
+      : Deconv2dBase(strides,
+                     dilations,
+                     paddings,
+                     padding_type,
+                     framework_type) {}
+  virtual ~Deconv2dGeneral() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      const Tensor *output_shape,
+      Tensor *output) override;
+};
+
+}  // namespace fp32
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_ARM_FP32_DECONV_2D_GENERAL_H_
diff --git a/mace/ops/arm/fp32/depthwise_conv_2d_3x3.cc b/mace/ops/arm/fp32/depthwise_conv_2d_3x3.cc
index 3ac8eb5de20503a89b9b25202b91ddbf8e305031..a27827b471818c049a09e532c059b56396e8f452 100644
--- a/mace/ops/arm/fp32/depthwise_conv_2d_3x3.cc
+++ b/mace/ops/arm/fp32/depthwise_conv_2d_3x3.cc
@@ -12,9 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
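The depthwise_conv_2d_3x3.cc changes below replace the `#pragma omp parallel for collapse(2)` loops with utils::ThreadPool::Compute2D. Judging from the call sites in this patch, the trailing arguments are (start, end, step) for each of the two dimensions and the lambda receives the sub-range assigned to a worker; the 3x3/4x4 stride-1 deconv kernels pass step 2 for the channel dimension because they produce two output channels per iteration. A small wrapper showing the call shape (the thread-pool header path and the wrapper name are assumptions, not from the patch):

#include "mace/core/types.h"         // mace::index_t
#include "mace/utils/thread_pool.h"  // assumed location of utils::ThreadPool

// Runs fn(batch_index, channel_index) for every pair on the MACE thread pool,
// the same two-level tiling used by the rewritten kernels below.
template <typename Fn>
void ParallelOverBatchAndChannel(mace::utils::ThreadPool *pool,
                                 mace::index_t batch,
                                 mace::index_t channels,
                                 Fn fn) {
  pool->Compute2D(
      [=](mace::index_t start0, mace::index_t end0, mace::index_t step0,
          mace::index_t start1, mace::index_t end1, mace::index_t step1) {
        for (mace::index_t b = start0; b < end0; b += step0) {
          for (mace::index_t c = start1; c < end1; c += step1) {
            fn(b, c);
          }
        }
      },
      0, batch, 1,       // dimension 0: start, end, step
      0, channels, 1);   // dimension 1: start, end, step
}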
-#include #include "mace/ops/arm/fp32/depthwise_conv_2d_3x3.h" +#include + namespace mace { namespace ops { namespace arm { @@ -64,14 +65,26 @@ MaceStatus DepthwiseConv2dK3x3S1::Compute(const mace::OpContext *context, const int pad_top = paddings[0] / 2; const int pad_left = paddings[1] / 2; - const index_t multiplier = out_shape[1] / in_shape[1]; - const index_t in_image_size = in_shape[2] * in_shape[3]; - const index_t out_image_size = out_shape[2] * out_shape[3]; - const index_t in_batch_size = in_shape[1] * in_image_size; - const index_t out_batch_size = out_shape[1] * out_image_size; + const index_t batch = in_shape[0]; + const index_t in_channels = in_shape[1]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + const index_t out_channels = out_shape[1]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + + const index_t in_image_size = in_height * in_width; + const index_t out_image_size = out_height * out_width; + const index_t in_batch_size = in_channels * in_image_size; + const index_t out_batch_size = out_channels * out_image_size; + const index_t multiplier = out_channels / in_channels; std::vector out_bounds; CalOutputBoundaryWithoutUsingInputPad(out_shape, paddings, &out_bounds); + const index_t valid_h_start = out_bounds[0]; + const index_t valid_h_stop = out_bounds[1]; + const index_t valid_w_start = out_bounds[2]; + const index_t valid_w_stop = out_bounds[3]; Tensor::MappingGuard in_guard(input); Tensor::MappingGuard filter_guard(filter); @@ -80,159 +93,211 @@ MaceStatus DepthwiseConv2dK3x3S1::Compute(const mace::OpContext *context, auto input_data = input->data(); auto output_data = output->mutable_data(); -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < in_shape[0]; ++b) { - for (index_t m = 0; m < out_shape[1]; ++m) { - const index_t c = m / multiplier; - const index_t multi_index = m % multiplier; - const float *in_base = input_data + b * in_batch_size + c * in_image_size; - const float - *filter_ptr = filter_data + multi_index * in_shape[1] * 9 + c * 9; - float *out_base = output_data + b * out_batch_size + m * out_image_size; - index_t h, w; - const index_t out_width = out_shape[3]; - const index_t in_height = in_shape[2]; - const index_t in_width = in_shape[3]; - - const index_t valid_h_start = out_bounds[0]; - const index_t valid_h_stop = out_bounds[1]; - const index_t valid_w_start = out_bounds[2]; - const index_t valid_w_stop = out_bounds[3]; - - // top - for (h = 0; h < valid_h_start; ++h) { - for (w = 0; w < out_shape[3]; ++w) { - DepthwiseConv2dPixel(in_base, filter_ptr, h, w, h - pad_top, - w - pad_left, out_width, in_height, in_width, 3, - 3, out_base); - } - } - - // load filter (1 outch x 3 height x 3 width): vf_outch_height - float32x4_t vf00, vf01, vf02; - vf00 = vld1q_f32(filter_ptr); - vf01 = vld1q_f32(filter_ptr + 3); - vf02 = vld1q_f32(filter_ptr + 5); - - for (h = valid_h_start; h + 1 < valid_h_stop; h += 2) { - // left - for (w = 0; w < valid_w_start; ++w) { - DepthwiseConv2dPixel(in_base, filter_ptr, h, w, h - pad_top, - w - pad_left, out_width, in_height, in_width, 3, - 3, out_base); - DepthwiseConv2dPixel(in_base, filter_ptr, h + 1, w, h + 1 - pad_top, - w - pad_left, out_width, in_height, in_width, 3, - 3, out_base); + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = 
start0; b < end0; b += step0) { + for (index_t m = start1; m < end1; m += step1) { + const index_t c = m / multiplier; + const index_t multi_index = m % multiplier; + const float + *in_base = input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr = filter_data + multi_index * in_channels * 9 + c * 9; + float *out_base = output_data + b * out_batch_size + m * out_image_size; + index_t h, w; + + // top + for (h = 0; h < valid_h_start; ++h) { + for (w = 0; w < out_width; ++w) { + DepthwiseConv2dPixel(in_base, + filter_ptr, + h, + w, + h - pad_top, + w - pad_left, + out_width, + in_height, + in_width, + 3, + 3, + out_base); + } } - for (w = valid_w_start; w + 3 < valid_w_stop; w += 4) { - // input (4 height x 3 slide): vi_height_slide - float32x4_t vi00, vi01, vi02, vi0n; - float32x4_t vi10, vi11, vi12, vi1n; - float32x4_t vi20, vi21, vi22, vi2n; - float32x4_t vi30, vi31, vi32, vi3n; - - // output (1 outch x 2 height x 4 width): vo_outch_height - float32x4_t vo00, vo01; - - // load input - index_t in_h = h - pad_top; - index_t in_w = w - pad_left; - index_t in_offset = in_h * in_width + in_w; - vi00 = vld1q_f32(in_base + in_offset); - vi0n = vld1q_f32(in_base + in_offset + 4); - vi10 = vld1q_f32(in_base + in_offset + in_width); - vi1n = vld1q_f32(in_base + in_offset + in_width + 4); - vi20 = vld1q_f32(in_base + in_offset + 2 * in_width); - vi2n = vld1q_f32(in_base + in_offset + 2 * in_width + 4); - vi30 = vld1q_f32(in_base + in_offset + 3 * in_width); - vi3n = vld1q_f32(in_base + in_offset + 3 * in_width + 4); - - vi01 = vextq_f32(vi00, vi0n, 1); - vi02 = vextq_f32(vi00, vi0n, 2); - vi11 = vextq_f32(vi10, vi1n, 1); - vi12 = vextq_f32(vi10, vi1n, 2); - vi21 = vextq_f32(vi20, vi2n, 1); - vi22 = vextq_f32(vi20, vi2n, 2); - vi31 = vextq_f32(vi30, vi3n, 1); - vi32 = vextq_f32(vi30, vi3n, 2); - - // load ouptut - index_t out_offset = h * out_width + w; - vo00 = vld1q_f32(out_base + out_offset); - vo01 = vld1q_f32(out_base + out_offset + out_width); + // load filter (1 outch x 3 height x 3 width): vf_outch_height + float32x4_t vf00, vf01, vf02; + vf00 = vld1q_f32(filter_ptr); + vf01 = vld1q_f32(filter_ptr + 3); + vf02 = vld1q_f32(filter_ptr + 5); + + for (h = valid_h_start; h + 1 < valid_h_stop; h += 2) { + // left + for (w = 0; w < valid_w_start; ++w) { + DepthwiseConv2dPixel(in_base, + filter_ptr, + h, + w, + h - pad_top, + w - pad_left, + out_width, + in_height, + in_width, + 3, + 3, + out_base); + DepthwiseConv2dPixel(in_base, + filter_ptr, + h + 1, + w, + h + 1 - pad_top, + w - pad_left, + out_width, + in_height, + in_width, + 3, + 3, + out_base); + } + + for (w = valid_w_start; w + 3 < valid_w_stop; w += 4) { + // input (4 height x 3 slide): vi_height_slide + float32x4_t vi00, vi01, vi02, vi0n; + float32x4_t vi10, vi11, vi12, vi1n; + float32x4_t vi20, vi21, vi22, vi2n; + float32x4_t vi30, vi31, vi32, vi3n; + + // output (1 outch x 2 height x 4 width): vo_outch_height + float32x4_t vo00, vo01; + + // load input + index_t in_h = h - pad_top; + index_t in_w = w - pad_left; + index_t in_offset = in_h * in_width + in_w; + vi00 = vld1q_f32(in_base + in_offset); + vi0n = vld1q_f32(in_base + in_offset + 4); + vi10 = vld1q_f32(in_base + in_offset + in_width); + vi1n = vld1q_f32(in_base + in_offset + in_width + 4); + vi20 = vld1q_f32(in_base + in_offset + 2 * in_width); + vi2n = vld1q_f32(in_base + in_offset + 2 * in_width + 4); + vi30 = vld1q_f32(in_base + in_offset + 3 * in_width); + vi3n = vld1q_f32(in_base + in_offset + 3 * in_width + 4); + + vi01 = vextq_f32(vi00, vi0n, 1); 
+ vi02 = vextq_f32(vi00, vi0n, 2); + vi11 = vextq_f32(vi10, vi1n, 1); + vi12 = vextq_f32(vi10, vi1n, 2); + vi21 = vextq_f32(vi20, vi2n, 1); + vi22 = vextq_f32(vi20, vi2n, 2); + vi31 = vextq_f32(vi30, vi3n, 1); + vi32 = vextq_f32(vi30, vi3n, 2); + + // load ouptut + index_t out_offset = h * out_width + w; + vo00 = vld1q_f32(out_base + out_offset); + vo01 = vld1q_f32(out_base + out_offset + out_width); #if defined(__aarch64__) - // outch 0, height 0 - vo00 = vfmaq_laneq_f32(vo00, vi00, vf00, 0); - vo00 = vfmaq_laneq_f32(vo00, vi01, vf00, 1); - vo00 = vfmaq_laneq_f32(vo00, vi02, vf00, 2); - vo00 = vfmaq_laneq_f32(vo00, vi10, vf01, 0); - vo00 = vfmaq_laneq_f32(vo00, vi11, vf01, 1); - vo00 = vfmaq_laneq_f32(vo00, vi12, vf01, 2); - vo00 = vfmaq_laneq_f32(vo00, vi20, vf02, 1); - vo00 = vfmaq_laneq_f32(vo00, vi21, vf02, 2); - vo00 = vfmaq_laneq_f32(vo00, vi22, vf02, 3); - - // outch 0, height 1 - vo01 = vfmaq_laneq_f32(vo01, vi10, vf00, 0); - vo01 = vfmaq_laneq_f32(vo01, vi11, vf00, 1); - vo01 = vfmaq_laneq_f32(vo01, vi12, vf00, 2); - vo01 = vfmaq_laneq_f32(vo01, vi20, vf01, 0); - vo01 = vfmaq_laneq_f32(vo01, vi21, vf01, 1); - vo01 = vfmaq_laneq_f32(vo01, vi22, vf01, 2); - vo01 = vfmaq_laneq_f32(vo01, vi30, vf02, 1); - vo01 = vfmaq_laneq_f32(vo01, vi31, vf02, 2); - vo01 = vfmaq_laneq_f32(vo01, vi32, vf02, 3); + // outch 0, height 0 + vo00 = vfmaq_laneq_f32(vo00, vi00, vf00, 0); + vo00 = vfmaq_laneq_f32(vo00, vi01, vf00, 1); + vo00 = vfmaq_laneq_f32(vo00, vi02, vf00, 2); + vo00 = vfmaq_laneq_f32(vo00, vi10, vf01, 0); + vo00 = vfmaq_laneq_f32(vo00, vi11, vf01, 1); + vo00 = vfmaq_laneq_f32(vo00, vi12, vf01, 2); + vo00 = vfmaq_laneq_f32(vo00, vi20, vf02, 1); + vo00 = vfmaq_laneq_f32(vo00, vi21, vf02, 2); + vo00 = vfmaq_laneq_f32(vo00, vi22, vf02, 3); + + // outch 0, height 1 + vo01 = vfmaq_laneq_f32(vo01, vi10, vf00, 0); + vo01 = vfmaq_laneq_f32(vo01, vi11, vf00, 1); + vo01 = vfmaq_laneq_f32(vo01, vi12, vf00, 2); + vo01 = vfmaq_laneq_f32(vo01, vi20, vf01, 0); + vo01 = vfmaq_laneq_f32(vo01, vi21, vf01, 1); + vo01 = vfmaq_laneq_f32(vo01, vi22, vf01, 2); + vo01 = vfmaq_laneq_f32(vo01, vi30, vf02, 1); + vo01 = vfmaq_laneq_f32(vo01, vi31, vf02, 2); + vo01 = vfmaq_laneq_f32(vo01, vi32, vf02, 3); #else - // outch 0, height 0 - vo00 = vmlaq_lane_f32(vo00, vi00, vget_low_f32(vf00), 0); - vo00 = vmlaq_lane_f32(vo00, vi01, vget_low_f32(vf00), 1); - vo00 = vmlaq_lane_f32(vo00, vi02, vget_high_f32(vf00), 0); - vo00 = vmlaq_lane_f32(vo00, vi10, vget_low_f32(vf01), 0); - vo00 = vmlaq_lane_f32(vo00, vi11, vget_low_f32(vf01), 1); - vo00 = vmlaq_lane_f32(vo00, vi12, vget_high_f32(vf01), 0); - vo00 = vmlaq_lane_f32(vo00, vi20, vget_low_f32(vf02), 1); - vo00 = vmlaq_lane_f32(vo00, vi21, vget_high_f32(vf02), 0); - vo00 = vmlaq_lane_f32(vo00, vi22, vget_high_f32(vf02), 1); - - // outch 0, height 1 - vo01 = vmlaq_lane_f32(vo01, vi10, vget_low_f32(vf00), 0); - vo01 = vmlaq_lane_f32(vo01, vi11, vget_low_f32(vf00), 1); - vo01 = vmlaq_lane_f32(vo01, vi12, vget_high_f32(vf00), 0); - vo01 = vmlaq_lane_f32(vo01, vi20, vget_low_f32(vf01), 0); - vo01 = vmlaq_lane_f32(vo01, vi21, vget_low_f32(vf01), 1); - vo01 = vmlaq_lane_f32(vo01, vi22, vget_high_f32(vf01), 0); - vo01 = vmlaq_lane_f32(vo01, vi30, vget_low_f32(vf02), 1); - vo01 = vmlaq_lane_f32(vo01, vi31, vget_high_f32(vf02), 0); - vo01 = vmlaq_lane_f32(vo01, vi32, vget_high_f32(vf02), 1); + // outch 0, height 0 + vo00 = vmlaq_lane_f32(vo00, vi00, vget_low_f32(vf00), 0); + vo00 = vmlaq_lane_f32(vo00, vi01, vget_low_f32(vf00), 1); + vo00 = vmlaq_lane_f32(vo00, vi02, 
vget_high_f32(vf00), 0); + vo00 = vmlaq_lane_f32(vo00, vi10, vget_low_f32(vf01), 0); + vo00 = vmlaq_lane_f32(vo00, vi11, vget_low_f32(vf01), 1); + vo00 = vmlaq_lane_f32(vo00, vi12, vget_high_f32(vf01), 0); + vo00 = vmlaq_lane_f32(vo00, vi20, vget_low_f32(vf02), 1); + vo00 = vmlaq_lane_f32(vo00, vi21, vget_high_f32(vf02), 0); + vo00 = vmlaq_lane_f32(vo00, vi22, vget_high_f32(vf02), 1); + + // outch 0, height 1 + vo01 = vmlaq_lane_f32(vo01, vi10, vget_low_f32(vf00), 0); + vo01 = vmlaq_lane_f32(vo01, vi11, vget_low_f32(vf00), 1); + vo01 = vmlaq_lane_f32(vo01, vi12, vget_high_f32(vf00), 0); + vo01 = vmlaq_lane_f32(vo01, vi20, vget_low_f32(vf01), 0); + vo01 = vmlaq_lane_f32(vo01, vi21, vget_low_f32(vf01), 1); + vo01 = vmlaq_lane_f32(vo01, vi22, vget_high_f32(vf01), 0); + vo01 = vmlaq_lane_f32(vo01, vi30, vget_low_f32(vf02), 1); + vo01 = vmlaq_lane_f32(vo01, vi31, vget_high_f32(vf02), 0); + vo01 = vmlaq_lane_f32(vo01, vi32, vget_high_f32(vf02), 1); #endif - vst1q_f32(out_base + out_offset, vo00); - vst1q_f32(out_base + out_offset + out_width, vo01); - } // w - - // right - for (; w < out_width; ++w) { - DepthwiseConv2dPixel(in_base, filter_ptr, h, w, h - pad_top, - w - pad_left, out_width, in_height, in_width, 3, - 3, out_base); - DepthwiseConv2dPixel(in_base, filter_ptr, h + 1, w, h + 1 - pad_top, - w - pad_left, out_width, in_height, in_width, 3, - 3, out_base); + vst1q_f32(out_base + out_offset, vo00); + vst1q_f32(out_base + out_offset + out_width, vo01); + } // w + + // right + for (; w < out_width; ++w) { + DepthwiseConv2dPixel(in_base, + filter_ptr, + h, + w, + h - pad_top, + w - pad_left, + out_width, + in_height, + in_width, + 3, + 3, + out_base); + DepthwiseConv2dPixel(in_base, + filter_ptr, + h + 1, + w, + h + 1 - pad_top, + w - pad_left, + out_width, + in_height, + in_width, + 3, + 3, + out_base); + } + } // h + + + // bottom + for (; h < out_height; ++h) { + for (w = 0; w < out_width; ++w) { + DepthwiseConv2dPixel(in_base, + filter_ptr, + h, + w, + h - pad_top, + w - pad_left, + out_width, + in_height, + in_width, + 3, + 3, + out_base); + } } - } // h - - - // bottom - for (; h < out_shape[2]; ++h) { - for (w = 0; w < out_shape[3]; ++w) { - DepthwiseConv2dPixel(in_base, filter_ptr, h, w, h - pad_top, - w - pad_left, out_width, in_height, in_width, 3, - 3, out_base); - } - } - } // m - } // b + } // m + } // b + }, 0, batch, 1, 0, out_channels, 1); // threadpool return MaceStatus::MACE_SUCCESS; } @@ -256,14 +321,26 @@ MaceStatus DepthwiseConv2dK3x3S2::Compute(const mace::OpContext *context, const int pad_top = paddings[0] / 2; const int pad_left = paddings[1] / 2; - const index_t multiplier = out_shape[1] / in_shape[1]; - const index_t in_image_size = in_shape[2] * in_shape[3]; - const index_t out_image_size = out_shape[2] * out_shape[3]; - const index_t in_batch_size = in_shape[1] * in_image_size; - const index_t out_batch_size = out_shape[1] * out_image_size; + const index_t batch = in_shape[0]; + const index_t in_channels = in_shape[1]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + const index_t out_channels = out_shape[1]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + + const index_t in_image_size = in_height * in_width; + const index_t out_image_size = out_height * out_width; + const index_t in_batch_size = in_channels * in_image_size; + const index_t out_batch_size = out_channels * out_image_size; + const index_t multiplier = out_channels / in_channels; std::vector out_bounds; 
CalOutputBoundaryWithoutUsingInputPad(out_shape, paddings, &out_bounds); + const index_t valid_h_start = out_bounds[0]; + const index_t valid_h_stop = out_bounds[1]; + const index_t valid_w_start = out_bounds[2]; + const index_t valid_w_stop = out_bounds[3]; Tensor::MappingGuard in_guard(input); Tensor::MappingGuard filter_guard(filter); @@ -272,131 +349,165 @@ MaceStatus DepthwiseConv2dK3x3S2::Compute(const mace::OpContext *context, auto input_data = input->data(); auto output_data = output->mutable_data(); -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < in_shape[0]; ++b) { - for (index_t m = 0; m < out_shape[1]; ++m) { - index_t c = m / multiplier; - index_t multi_index = m % multiplier; - const float *in_base = input_data + b * in_batch_size + c * in_image_size; - const float - *filter_ptr = filter_data + multi_index * in_shape[1] * 9 + c * 9; - float *out_base = output_data + b * out_batch_size + m * out_image_size; - index_t h, w; - const index_t out_width = out_shape[3]; - const index_t in_height = in_shape[2]; - const index_t in_width = in_shape[3]; - - const index_t valid_h_start = out_bounds[0]; - const index_t valid_h_stop = out_bounds[1]; - const index_t valid_w_start = out_bounds[2]; - const index_t valid_w_stop = out_bounds[3]; - - // top - for (h = 0; h < valid_h_start; ++h) { - for (w = 0; w < out_width; ++w) { - DepthwiseConv2dPixel(in_base, filter_ptr, h, w, h * 2 - pad_top, - w * 2 - pad_left, out_width, in_height, in_width, - 3, 3, out_base); - } - } - - // load filter (1 outch x 3 height x 3 width): vf_outch_height - float32x4_t vf00, vf01, vf02; - vf00 = vld1q_f32(filter_ptr); - vf01 = vld1q_f32(filter_ptr + 3); - vf02 = vld1q_f32(filter_ptr + 5); - - for (h = valid_h_start; h < valid_h_stop; ++h) { - // left - for (w = 0; w < valid_w_start; ++w) { - DepthwiseConv2dPixel(in_base, filter_ptr, h, w, h * 2 - pad_top, - w * 2 - pad_left, out_width, in_height, in_width, - 3, 3, out_base); + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t m = start1; m < end1; m += step1) { + index_t c = m / multiplier; + index_t multi_index = m % multiplier; + const float + *in_base = input_data + b * in_batch_size + c * in_image_size; + const float + *filter_ptr = filter_data + multi_index * in_channels * 9 + c * 9; + float *out_base = output_data + b * out_batch_size + m * out_image_size; + index_t h, w; + + // top + for (h = 0; h < valid_h_start; ++h) { + for (w = 0; w < out_width; ++w) { + DepthwiseConv2dPixel(in_base, + filter_ptr, + h, + w, + h * 2 - pad_top, + w * 2 - pad_left, + out_width, + in_height, + in_width, + 3, + 3, + out_base); + } } - for (w = valid_w_start; w + 3 < valid_w_stop; w += 4) { - float32x4x2_t vi0, vi1, vi2; - float32x4_t vi0n, vi1n, vi2n; - - // input (3 height x 3 slide): vi_height_slide - float32x4_t vi00, vi01, vi02; - float32x4_t vi10, vi11, vi12; - float32x4_t vi20, vi21, vi22; - - // output (1 outch x 1 height x 4 width): vo - float32x4_t vo; - - // load input - index_t in_h = h * 2 - pad_top; - index_t in_w = w * 2 - pad_left; - index_t in_offset = in_h * in_width + in_w; - vi0 = vld2q_f32(in_base + in_offset); // [0.2.4.6, 1.3.5.7] - vi1 = vld2q_f32(in_base + in_offset + in_width); - vi2 = vld2q_f32(in_base + in_offset + 2 * in_width); - - vi0n = vld1q_f32(in_base + in_offset + 8); // [8.9.10.11] 
- vi1n = vld1q_f32(in_base + in_offset + in_width + 8); - vi2n = vld1q_f32(in_base + in_offset + 2 * in_width + 8); - - // load ouptut - index_t out_offset = h * out_width + w; - vo = vld1q_f32(out_base + out_offset); - - vi00 = vi0.val[0]; // [0.2.4.6] - vi01 = vi0.val[1]; // [1.3.5.7] - vi02 = vextq_f32(vi00, vi0n, 1); // [2.4.6.8] - vi10 = vi1.val[0]; - vi11 = vi1.val[1]; - vi12 = vextq_f32(vi10, vi1n, 1); - vi20 = vi2.val[0]; - vi21 = vi2.val[1]; - vi22 = vextq_f32(vi20, vi2n, 1); + // load filter (1 outch x 3 height x 3 width): vf_outch_height + float32x4_t vf00, vf01, vf02; + vf00 = vld1q_f32(filter_ptr); + vf01 = vld1q_f32(filter_ptr + 3); + vf02 = vld1q_f32(filter_ptr + 5); + + for (h = valid_h_start; h < valid_h_stop; ++h) { + // left + for (w = 0; w < valid_w_start; ++w) { + DepthwiseConv2dPixel(in_base, + filter_ptr, + h, + w, + h * 2 - pad_top, + w * 2 - pad_left, + out_width, + in_height, + in_width, + 3, + 3, + out_base); + } + + for (w = valid_w_start; w + 3 < valid_w_stop; w += 4) { + float32x4x2_t vi0, vi1, vi2; + float32x4_t vi0n, vi1n, vi2n; + + // input (3 height x 3 slide): vi_height_slide + float32x4_t vi00, vi01, vi02; + float32x4_t vi10, vi11, vi12; + float32x4_t vi20, vi21, vi22; + + // output (1 outch x 1 height x 4 width): vo + float32x4_t vo; + + // load input + index_t in_h = h * 2 - pad_top; + index_t in_w = w * 2 - pad_left; + index_t in_offset = in_h * in_width + in_w; + vi0 = vld2q_f32(in_base + in_offset); // [0.2.4.6, 1.3.5.7] + vi1 = vld2q_f32(in_base + in_offset + in_width); + vi2 = vld2q_f32(in_base + in_offset + 2 * in_width); + + vi0n = vld1q_f32(in_base + in_offset + 8); // [8.9.10.11] + vi1n = vld1q_f32(in_base + in_offset + in_width + 8); + vi2n = vld1q_f32(in_base + in_offset + 2 * in_width + 8); + + // load ouptut + index_t out_offset = h * out_width + w; + vo = vld1q_f32(out_base + out_offset); + + vi00 = vi0.val[0]; // [0.2.4.6] + vi01 = vi0.val[1]; // [1.3.5.7] + vi02 = vextq_f32(vi00, vi0n, 1); // [2.4.6.8] + vi10 = vi1.val[0]; + vi11 = vi1.val[1]; + vi12 = vextq_f32(vi10, vi1n, 1); + vi20 = vi2.val[0]; + vi21 = vi2.val[1]; + vi22 = vextq_f32(vi20, vi2n, 1); #if defined(__aarch64__) - // outch 0, height 0 - vo = vfmaq_laneq_f32(vo, vi00, vf00, 0); - vo = vfmaq_laneq_f32(vo, vi01, vf00, 1); - vo = vfmaq_laneq_f32(vo, vi02, vf00, 2); - vo = vfmaq_laneq_f32(vo, vi10, vf01, 0); - vo = vfmaq_laneq_f32(vo, vi11, vf01, 1); - vo = vfmaq_laneq_f32(vo, vi12, vf01, 2); - vo = vfmaq_laneq_f32(vo, vi20, vf02, 1); - vo = vfmaq_laneq_f32(vo, vi21, vf02, 2); - vo = vfmaq_laneq_f32(vo, vi22, vf02, 3); + // outch 0, height 0 + vo = vfmaq_laneq_f32(vo, vi00, vf00, 0); + vo = vfmaq_laneq_f32(vo, vi01, vf00, 1); + vo = vfmaq_laneq_f32(vo, vi02, vf00, 2); + vo = vfmaq_laneq_f32(vo, vi10, vf01, 0); + vo = vfmaq_laneq_f32(vo, vi11, vf01, 1); + vo = vfmaq_laneq_f32(vo, vi12, vf01, 2); + vo = vfmaq_laneq_f32(vo, vi20, vf02, 1); + vo = vfmaq_laneq_f32(vo, vi21, vf02, 2); + vo = vfmaq_laneq_f32(vo, vi22, vf02, 3); #else - // outch 0, height 0 - vo = vmlaq_lane_f32(vo, vi00, vget_low_f32(vf00), 0); - vo = vmlaq_lane_f32(vo, vi01, vget_low_f32(vf00), 1); - vo = vmlaq_lane_f32(vo, vi02, vget_high_f32(vf00), 0); - vo = vmlaq_lane_f32(vo, vi10, vget_low_f32(vf01), 0); - vo = vmlaq_lane_f32(vo, vi11, vget_low_f32(vf01), 1); - vo = vmlaq_lane_f32(vo, vi12, vget_high_f32(vf01), 0); - vo = vmlaq_lane_f32(vo, vi20, vget_low_f32(vf02), 1); - vo = vmlaq_lane_f32(vo, vi21, vget_high_f32(vf02), 0); - vo = vmlaq_lane_f32(vo, vi22, vget_high_f32(vf02), 1); + // outch 0, height 0 + vo 
= vmlaq_lane_f32(vo, vi00, vget_low_f32(vf00), 0); + vo = vmlaq_lane_f32(vo, vi01, vget_low_f32(vf00), 1); + vo = vmlaq_lane_f32(vo, vi02, vget_high_f32(vf00), 0); + vo = vmlaq_lane_f32(vo, vi10, vget_low_f32(vf01), 0); + vo = vmlaq_lane_f32(vo, vi11, vget_low_f32(vf01), 1); + vo = vmlaq_lane_f32(vo, vi12, vget_high_f32(vf01), 0); + vo = vmlaq_lane_f32(vo, vi20, vget_low_f32(vf02), 1); + vo = vmlaq_lane_f32(vo, vi21, vget_high_f32(vf02), 0); + vo = vmlaq_lane_f32(vo, vi22, vget_high_f32(vf02), 1); #endif - vst1q_f32(out_base + out_offset, vo); - } // w - - // right - for (; w < out_width; ++w) { - DepthwiseConv2dPixel(in_base, filter_ptr, h, w, h * 2 - pad_top, - w * 2 - pad_left, out_width, in_height, in_width, - 3, 3, out_base); + vst1q_f32(out_base + out_offset, vo); + } // w + + // right + for (; w < out_width; ++w) { + DepthwiseConv2dPixel(in_base, + filter_ptr, + h, + w, + h * 2 - pad_top, + w * 2 - pad_left, + out_width, + in_height, + in_width, + 3, + 3, + out_base); + } + } // h + + + // bottom + for (; h < out_height; ++h) { + for (w = 0; w < out_width; ++w) { + DepthwiseConv2dPixel(in_base, + filter_ptr, + h, + w, + h * 2 - pad_top, + w * 2 - pad_left, + out_width, + in_height, + in_width, + 3, + 3, + out_base); + } } - } // h - - - // bottom - for (; h < out_shape[2]; ++h) { - for (w = 0; w < out_shape[3]; ++w) { - DepthwiseConv2dPixel(in_base, filter_ptr, h, w, h * 2 - pad_top, - w * 2 - pad_left, out_width, in_height, in_width, - 3, 3, out_base); - } - } - } // m - } // b + } // m + } // b + }, 0, batch, 1, 0, out_channels, 1); return MaceStatus::MACE_SUCCESS; } diff --git a/mace/ops/arm/fp32/depthwise_conv_2d_3x3.h b/mace/ops/arm/fp32/depthwise_conv_2d_3x3.h index e89ae2b969cdbdb7b32f34eeeada62d3ec14af3b..c130fbffd361dfb33be9974b3d603e630cb80979 100644 --- a/mace/ops/arm/fp32/depthwise_conv_2d_3x3.h +++ b/mace/ops/arm/fp32/depthwise_conv_2d_3x3.h @@ -28,7 +28,7 @@ namespace fp32 { class DepthwiseConv2dK3x3S1 : public Conv2dBase { public: - DepthwiseConv2dK3x3S1(const std::vector paddings, + DepthwiseConv2dK3x3S1(const std::vector &paddings, const Padding padding_type) : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {} virtual ~DepthwiseConv2dK3x3S1() {} @@ -37,12 +37,12 @@ class DepthwiseConv2dK3x3S1 : public Conv2dBase { const OpContext *context, const Tensor *input, const Tensor *filter, - Tensor *output); + Tensor *output) override; }; class DepthwiseConv2dK3x3S2 : public Conv2dBase { public: - DepthwiseConv2dK3x3S2(const std::vector paddings, + DepthwiseConv2dK3x3S2(const std::vector &paddings, const Padding padding_type) : Conv2dBase({2, 2}, {1, 1}, paddings, padding_type) {} virtual ~DepthwiseConv2dK3x3S2() {} @@ -51,7 +51,7 @@ class DepthwiseConv2dK3x3S2 : public Conv2dBase { const OpContext *context, const Tensor *input, const Tensor *filter, - Tensor *output); + Tensor *output) override; }; } // namespace fp32 diff --git a/mace/ops/arm/fp32/depthwise_deconv_2d_3x3.cc b/mace/ops/arm/fp32/depthwise_deconv_2d_3x3.cc new file mode 100644 index 0000000000000000000000000000000000000000..3cd6d527b7f1fa67d053cc96dea8ae6505e32352 --- /dev/null +++ b/mace/ops/arm/fp32/depthwise_deconv_2d_3x3.cc @@ -0,0 +1,782 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
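In the depthwise deconvolution variants below, group_ is set to the number of input channels, so channel c is deconvolved with its own 9-float filter at filter_data + c * 9. A plain scalar reference for what the stride-1 NEON code computes (single batch, NCHW, output pre-zeroed; sizes are the pre-crop padded ones, so out_h = h + 2 and out_w = w + 2 for a 3x3 kernel at stride 1):

// Scalar reference for a stride-1 3x3 depthwise deconvolution; sketch only.
void DepthwiseDeconv3x3S1Ref(const float *in, const float *filter, float *out,
                             int channels, int h, int w, int out_h, int out_w) {
  for (int c = 0; c < channels; ++c) {
    const float *in_c = in + c * h * w;
    const float *k = filter + c * 9;        // one 3x3 filter per channel
    float *out_c = out + c * out_h * out_w;
    for (int i = 0; i < h; ++i) {
      for (int j = 0; j < w; ++j) {
        const float val = in_c[i * w + j];
        for (int ki = 0; ki < 3; ++ki) {
          for (int kj = 0; kj < 3; ++kj) {
            // every input pixel scatters a scaled copy of the filter into a
            // 3x3 window of the output
            out_c[(i + ki) * out_w + (j + kj)] += val * k[ki * 3 + kj];
          }
        }
      }
    }
  }
}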
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/arm/fp32/depthwise_deconv_2d_3x3.h" + +#include +#include "mace/ops/arm/fp32/common_neon.h" + +namespace mace { +namespace ops { +namespace arm { +namespace fp32 { + +MaceStatus DepthwiseDeconv2dK3x3S1::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) { + std::unique_ptr padded_out; + std::vector out_pad_size; + group_ = input->dim(1); + ResizeOutAndPadOut(context, + input, + filter, + output_shape, + output, + &out_pad_size, + &padded_out); + + Tensor *out_tensor = output; + if (padded_out != nullptr) { + out_tensor = padded_out.get(); + } + + out_tensor->Clear(); + + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard filter_mapper(filter); + Tensor::MappingGuard output_mapper(output); + + auto input_data = input->data(); + auto filter_data = filter->data(); + auto padded_out_data = out_tensor->mutable_data(); + + auto &in_shape = input->shape(); + auto &out_shape = out_tensor->shape(); + + const index_t batch = in_shape[0]; + const index_t channels = in_shape[1]; + const index_t h = in_shape[2]; + const index_t w = in_shape[3]; + const index_t in_img_size = h * w; + const index_t outh = out_shape[2]; + const index_t outw = out_shape[3]; + const index_t out_img_size = outh * outw; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { + const index_t offset = b * channels + c; + float *out_base = padded_out_data + offset * out_img_size; + const float *input_base = input_data + offset * in_img_size; + const float *kernel_base = filter_data + c * 9; + const float *in = input_base; + const float *k0 = kernel_base; + const float *k1 = kernel_base + 3; + const float *k2 = kernel_base + 5; + + // load filter + float32x4_t k0_vec = vld1q_f32(k0); + float32x4_t k1_vec = vld1q_f32(k1); + float32x4_t k2_vec = vld1q_f32(k2); + + for (index_t i = 0; i < h; ++i) { + float *out_row_base = out_base + i * outw; + float *out_row0 = out_row_base; + float *out_row1 = out_row_base + outw; + float *out_row2 = out_row_base + 2 * outw; + index_t j = 0; + + for (; j + 3 < w; j += 4) { + float32x4_t in_vec = vld1q_f32(in); + + float32x4_t out00, out01, out02; + float32x4_t out10, out11, out12; + float32x4_t out20, out21, out22; + + out00 = vld1q_f32(out_row0 + 0); + out00 = neon_vfma_lane_0(out00, in_vec, k0_vec); + vst1q_f32(out_row0 + 0, out00); + + out01 = vld1q_f32(out_row0 + 1); + out01 = neon_vfma_lane_1(out01, in_vec, k0_vec); + vst1q_f32(out_row0 + 1, out01); + + out02 = vld1q_f32(out_row0 + 2); + out02 = neon_vfma_lane_2(out02, in_vec, k0_vec); + vst1q_f32(out_row0 + 2, out02); + + out10 = vld1q_f32(out_row1 + 0); + out10 = neon_vfma_lane_0(out10, in_vec, k1_vec); + vst1q_f32(out_row1 + 0, out10); + + out11 = vld1q_f32(out_row1 + 1); + out11 = neon_vfma_lane_1(out11, in_vec, k1_vec); + vst1q_f32(out_row1 + 1, out11); + 
+ out12 = vld1q_f32(out_row1 + 2); + out12 = neon_vfma_lane_2(out12, in_vec, k1_vec); + vst1q_f32(out_row1 + 2, out12); + + out20 = vld1q_f32(out_row2 + 0); + out20 = neon_vfma_lane_1(out20, in_vec, k2_vec); + vst1q_f32(out_row2 + 0, out20); + + out21 = vld1q_f32(out_row2 + 1); + out21 = neon_vfma_lane_2(out21, in_vec, k2_vec); + vst1q_f32(out_row2 + 1, out21); + + out22 = vld1q_f32(out_row2 + 2); + out22 = neon_vfma_lane_3(out22, in_vec, k2_vec); + vst1q_f32(out_row2 + 2, out22); + + in += 4; + out_row0 += 4; + out_row1 += 4; + out_row2 += 4; + } + + for (; j < w; ++j) { + float val = in[0]; + for (int k = 0; k < 3; ++k) { + out_row0[k] += val * k0[k]; + out_row1[k] += val * k1[k]; + out_row2[k] += val * k2[k + 1]; + } + in++; + out_row0++; + out_row1++; + out_row2++; + } + } + } + } + }, 0, batch, 1, 0, channels, 1); + + UnPadOutput(*out_tensor, out_pad_size, output); + + return MaceStatus::MACE_SUCCESS; +} + +MaceStatus DepthwiseDeconv2dK3x3S2::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) { + std::unique_ptr padded_out; + std::vector out_pad_size; + group_ = input->dim(1); + ResizeOutAndPadOut(context, + input, + filter, + output_shape, + output, + &out_pad_size, + &padded_out); + + Tensor *out_tensor = output; + if (padded_out != nullptr) { + out_tensor = padded_out.get(); + } + + out_tensor->Clear(); + + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard filter_mapper(filter); + Tensor::MappingGuard output_mapper(output); + + auto input_data = input->data(); + auto filter_data = filter->data(); + auto padded_out_data = out_tensor->mutable_data(); + + auto &in_shape = input->shape(); + auto &out_shape = out_tensor->shape(); + + const index_t batch = in_shape[0]; + const index_t channels = in_shape[1]; + const index_t h = in_shape[2]; + const index_t w = in_shape[3]; + const index_t in_img_size = h * w; + const index_t outh = out_shape[2]; + const index_t outw = out_shape[3]; + const index_t out_img_size = outh * outw; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { + const index_t offset = b * channels + c; + float *out_base = padded_out_data + offset * out_img_size; + const float *input_base = input_data + offset * in_img_size; + const float *kernel_base = filter_data + c * 9; + const float *in = input_base; + + const float *k0 = kernel_base; + const float *k1 = kernel_base + 3; + const float *k2 = kernel_base + 5; + + float32x4_t k0_vec = vld1q_f32(k0); + float32x4_t k1_vec = vld1q_f32(k1); + float32x4_t k2_vec = vld1q_f32(k2); + + for (index_t i = 0; i < h; ++i) { + float *out_row_base = out_base + i * 2 * outw; + float *out_row_0 = out_row_base; + float *out_row_1 = out_row_0 + outw; + float *out_row_2 = out_row_1 + outw; + + index_t j = 0; + + for (index_t n = 0; n + 9 < outw; n += 8) { + float32x4_t in_vec = vld1q_f32(in); + + // out row 0 + float32x4x2_t out00 = vld2q_f32(out_row_0); + out00.val[0] = + neon_vfma_lane_0(out00.val[0], in_vec, k0_vec); + out00.val[1] = + neon_vfma_lane_1(out00.val[1], in_vec, k0_vec); + vst2q_f32(out_row_0, out00); + + float32x4x2_t out01 = vld2q_f32(out_row_0 + 2); + out01.val[0] = + neon_vfma_lane_2(out01.val[0], in_vec, k0_vec); + vst2q_f32(out_row_0 + 2, out01); + + // out row 1 + 
float32x4x2_t out10 = vld2q_f32(out_row_1); + out10.val[0] = + neon_vfma_lane_0(out10.val[0], in_vec, k1_vec); + out10.val[1] = + neon_vfma_lane_1(out10.val[1], in_vec, k1_vec); + vst2q_f32(out_row_1, out10); + + float32x4x2_t out11 = vld2q_f32(out_row_1 + 2); + out11.val[0] = + neon_vfma_lane_2(out11.val[0], in_vec, k1_vec); + vst2q_f32(out_row_1 + 2, out11); + + // out row 2 + float32x4x2_t out20 = vld2q_f32(out_row_2); + out20.val[0] = + neon_vfma_lane_1(out20.val[0], in_vec, k2_vec); + out20.val[1] = + neon_vfma_lane_2(out20.val[1], in_vec, k2_vec); + vst2q_f32(out_row_2, out20); + + float32x4x2_t out21 = vld2q_f32(out_row_2 + 2); + out21.val[0] = + neon_vfma_lane_3(out21.val[0], in_vec, k2_vec); + vst2q_f32(out_row_2 + 2, out21); + + in += 4; + out_row_0 += 8; + out_row_1 += 8; + out_row_2 += 8; + j += 4; + } + + for (; j < w; ++j) { + float val = in[0]; + + for (int k = 0; k < 3; ++k) { + out_row_0[k] += val * k0[k]; + out_row_1[k] += val * k1[k]; + out_row_2[k] += val * k2[k + 1]; + } + + in++; + out_row_0 += 2; + out_row_1 += 2; + out_row_2 += 2; + } + } + } + } + }, 0, batch, 1, 0, channels, 1); + + UnPadOutput(*out_tensor, out_pad_size, output); + + return MaceStatus::MACE_SUCCESS; +} + +MaceStatus GroupDeconv2dK3x3S1::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) { + std::unique_ptr padded_out; + std::vector out_pad_size; + ResizeOutAndPadOut(context, + input, + filter, + output_shape, + output, + &out_pad_size, + &padded_out); + + Tensor *out_tensor = output; + if (padded_out != nullptr) { + out_tensor = padded_out.get(); + } + + out_tensor->Clear(); + + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard filter_mapper(filter); + Tensor::MappingGuard output_mapper(output); + + auto input_data = input->data(); + auto filter_data = filter->data(); + auto padded_out_data = out_tensor->mutable_data(); + + auto &in_shape = input->shape(); + auto &out_shape = out_tensor->shape(); + + const index_t batch = in_shape[0]; + const index_t inch = in_shape[1]; + const index_t h = in_shape[2]; + const index_t w = in_shape[3]; + + const index_t outch = out_shape[1]; + const index_t outh = out_shape[2]; + const index_t outw = out_shape[3]; + + const index_t in_img_size = h * w; + const index_t out_img_size = outh * outw; + + const index_t inch_g = inch / group_; + const index_t outch_g = outch / group_; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute3D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1, + index_t start2, index_t end2, index_t step2) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t g = start1; g < end1; g += step1) { + for (index_t oc = start2; oc < end2; oc += step2) { + if (oc + 1 < outch_g) { + const index_t out_offset = b * outch + outch_g * g + oc; + float *out_base0 = padded_out_data + out_offset * out_img_size; + float *out_base1 = out_base0 + out_img_size; + for (index_t ic = 0; ic < inch_g; ++ic) { + const index_t in_offset = b * inch + inch_g * g + ic; + const float *input_base = input_data + in_offset * in_img_size; + const index_t kernel_offset = (oc * group_ + g) * inch_g + ic; + const float *kernel_base0 = filter_data + kernel_offset * 9; + const float *kernel_base1 = kernel_base0 + inch * 9; + const float *in = input_base; + + // output channel 0 + const float *k0_0 = kernel_base0; + const float *k0_1 = kernel_base0 + 3; + const float *k0_2 = 
kernel_base0 + 5; + // output channel 1 + const float *k1_0 = kernel_base1; + const float *k1_1 = kernel_base1 + 3; + const float *k1_2 = kernel_base1 + 5; + + // load filter + float32x4_t k00_vec, k01_vec, k02_vec; + float32x4_t k10_vec, k11_vec, k12_vec; + + k00_vec = vld1q_f32(k0_0); + k01_vec = vld1q_f32(k0_1); + k02_vec = vld1q_f32(k0_2); + + k10_vec = vld1q_f32(k1_0); + k11_vec = vld1q_f32(k1_1); + k12_vec = vld1q_f32(k1_2); + + for (index_t i = 0; i < h; ++i) { + float *out_row_base0 = out_base0 + i * outw; + float *out_row0_0 = out_row_base0; + float *out_row0_1 = out_row_base0 + outw; + float *out_row0_2 = out_row_base0 + 2 * outw; + + float *out_row_base1 = out_base1 + i * outw; + float *out_row1_0 = out_row_base1; + float *out_row1_1 = out_row_base1 + outw; + float *out_row1_2 = out_row_base1 + 2 * outw; + + index_t j = 0; + + for (; j + 3 < w; j += 4) { + float32x4_t in_vec = vld1q_f32(in); + + float32x4_t out00, out01, out02; + float32x4_t out10, out11, out12; + float32x4_t out20, out21, out22; + + out00 = vld1q_f32(out_row0_0); + out00 = neon_vfma_lane_0(out00, in_vec, k00_vec); + vst1q_f32(out_row0_0, out00); + + out01 = vld1q_f32(out_row0_0 + 1); + out01 = neon_vfma_lane_1(out01, in_vec, k00_vec); + vst1q_f32(out_row0_0 + 1, out01); + + out02 = vld1q_f32(out_row0_0 + 2); + out02 = neon_vfma_lane_2(out02, in_vec, k00_vec); + vst1q_f32(out_row0_0 + 2, out02); + + out10 = vld1q_f32(out_row0_1 + 0); + out10 = neon_vfma_lane_0(out10, in_vec, k01_vec); + vst1q_f32(out_row0_1 + 0, out10); + + out11 = vld1q_f32(out_row0_1 + 1); + out11 = neon_vfma_lane_1(out11, in_vec, k01_vec); + vst1q_f32(out_row0_1 + 1, out11); + + out12 = vld1q_f32(out_row0_1 + 2); + out12 = neon_vfma_lane_2(out12, in_vec, k01_vec); + vst1q_f32(out_row0_1 + 2, out12); + + out20 = vld1q_f32(out_row0_2 + 0); + out20 = neon_vfma_lane_1(out20, in_vec, k02_vec); + vst1q_f32(out_row0_2 + 0, out20); + + out21 = vld1q_f32(out_row0_2 + 1); + out21 = neon_vfma_lane_2(out21, in_vec, k02_vec); + vst1q_f32(out_row0_2 + 1, out21); + + out22 = vld1q_f32(out_row0_2 + 2); + out22 = neon_vfma_lane_3(out22, in_vec, k02_vec); + vst1q_f32(out_row0_2 + 2, out22); + + out00 = vld1q_f32(out_row1_0 + 0); + out00 = neon_vfma_lane_0(out00, in_vec, k10_vec); + vst1q_f32(out_row1_0 + 0, out00); + + out01 = vld1q_f32(out_row1_0 + 1); + out01 = neon_vfma_lane_1(out01, in_vec, k10_vec); + vst1q_f32(out_row1_0 + 1, out01); + + out02 = vld1q_f32(out_row1_0 + 2); + out02 = neon_vfma_lane_2(out02, in_vec, k10_vec); + vst1q_f32(out_row1_0 + 2, out02); + + out10 = vld1q_f32(out_row1_1 + 0); + out10 = neon_vfma_lane_0(out10, in_vec, k11_vec); + vst1q_f32(out_row1_1 + 0, out10); + + out11 = vld1q_f32(out_row1_1 + 1); + out11 = neon_vfma_lane_1(out11, in_vec, k11_vec); + vst1q_f32(out_row1_1 + 1, out11); + + out12 = vld1q_f32(out_row1_1 + 2); + out12 = neon_vfma_lane_2(out12, in_vec, k11_vec); + vst1q_f32(out_row1_1 + 2, out12); + + out20 = vld1q_f32(out_row1_2 + 0); + out20 = neon_vfma_lane_1(out20, in_vec, k12_vec); + vst1q_f32(out_row1_2 + 0, out20); + + out21 = vld1q_f32(out_row1_2 + 1); + out21 = neon_vfma_lane_2(out21, in_vec, k12_vec); + vst1q_f32(out_row1_2 + 1, out21); + + out22 = vld1q_f32(out_row1_2 + 2); + out22 = neon_vfma_lane_3(out22, in_vec, k12_vec); + vst1q_f32(out_row1_2 + 2, out22); + + in += 4; + out_row0_0 += 4; + out_row0_1 += 4; + out_row0_2 += 4; + out_row1_0 += 4; + out_row1_1 += 4; + out_row1_2 += 4; + } + + for (; j < w; ++j) { + float val = in[0]; + for (int k = 0; k < 3; ++k) { + out_row0_0[k] += val * k0_0[k]; + 
out_row0_1[k] += val * k0_1[k]; + out_row0_2[k] += val * k0_2[k + 1]; + out_row1_0[k] += val * k1_0[k]; + out_row1_1[k] += val * k1_1[k]; + out_row1_2[k] += val * k1_2[k + 1]; + } + in++; + out_row0_0++; + out_row0_1++; + out_row0_2++; + out_row1_0++; + out_row1_1++; + out_row1_2++; + } + } + } + } else { + const index_t out_offset = b * outch + outch_g * g + oc; + float *out_base0 = padded_out_data + out_offset * out_img_size; + for (index_t ic = 0; ic < inch_g; ++ic) { + const index_t in_offset = (b * group_ + g) * inch_g + ic; + const float *input_base = input_data + in_offset * in_img_size; + const index_t kernel_offset = (oc * group_ + g) * inch_g + ic; + const float *kernel_base0 = filter_data + kernel_offset * 9; + const float *in = input_base; + const float *k0_0 = kernel_base0; + const float *k0_1 = kernel_base0 + 3; + const float *k0_2 = kernel_base0 + 5; + + // load filter + float32x4_t k00_vec = vld1q_f32(k0_0); + float32x4_t k01_vec = vld1q_f32(k0_1); + float32x4_t k02_vec = vld1q_f32(k0_2); + + for (index_t i = 0; i < h; ++i) { + float *out_row_base0 = out_base0 + i * outw; + float *out_row0_0 = out_row_base0; + float *out_row0_1 = out_row_base0 + outw; + float *out_row0_2 = out_row_base0 + 2 * outw; + index_t j = 0; + + for (; j + 3 < w; j += 4) { + float32x4_t in_vec = vld1q_f32(in); + + float32x4_t out00, out01, out02; + float32x4_t out10, out11, out12; + float32x4_t out20, out21, out22; + + out00 = vld1q_f32(out_row0_0 + 0); + out00 = neon_vfma_lane_0(out00, in_vec, k00_vec); + vst1q_f32(out_row0_0 + 0, out00); + + out01 = vld1q_f32(out_row0_0 + 1); + out01 = neon_vfma_lane_1(out01, in_vec, k00_vec); + vst1q_f32(out_row0_0 + 1, out01); + + out02 = vld1q_f32(out_row0_0 + 2); + out02 = neon_vfma_lane_2(out02, in_vec, k00_vec); + vst1q_f32(out_row0_0 + 2, out02); + + out10 = vld1q_f32(out_row0_1 + 0); + out10 = neon_vfma_lane_0(out10, in_vec, k01_vec); + vst1q_f32(out_row0_1 + 0, out10); + + out11 = vld1q_f32(out_row0_1 + 1); + out11 = neon_vfma_lane_1(out11, in_vec, k01_vec); + vst1q_f32(out_row0_1 + 1, out11); + + out12 = vld1q_f32(out_row0_1 + 2); + out12 = neon_vfma_lane_2(out12, in_vec, k01_vec); + vst1q_f32(out_row0_1 + 2, out12); + + out20 = vld1q_f32(out_row0_2 + 0); + out20 = neon_vfma_lane_1(out20, in_vec, k02_vec); + vst1q_f32(out_row0_2 + 0, out20); + + out21 = vld1q_f32(out_row0_2 + 1); + out21 = neon_vfma_lane_2(out21, in_vec, k02_vec); + vst1q_f32(out_row0_2 + 1, out21); + + out22 = vld1q_f32(out_row0_2 + 2); + out22 = neon_vfma_lane_3(out22, in_vec, k02_vec); + vst1q_f32(out_row0_2 + 2, out22); + + in += 4; + out_row0_0 += 4; + out_row0_1 += 4; + out_row0_2 += 4; + } + + for (; j < w; ++j) { + float val = in[0]; + for (int k = 0; k < 3; ++k) { + out_row0_0[k] += val * k0_0[k]; + out_row0_1[k] += val * k0_1[k]; + out_row0_2[k] += val * k0_2[k + 1]; + } + in++; + out_row0_0++; + out_row0_1++; + out_row0_2++; + } + } + } + } + } + } + } + }, 0, batch, 1, 0, group_, 1, 0, outch_g, 2); + + UnPadOutput(*out_tensor, out_pad_size, output); + + return MaceStatus::MACE_SUCCESS; +} + +MaceStatus GroupDeconv2dK3x3S2::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) { + std::unique_ptr padded_out; + std::vector out_pad_size; + ResizeOutAndPadOut(context, + input, + filter, + output_shape, + output, + &out_pad_size, + &padded_out); + + Tensor *out_tensor = output; + if (padded_out != nullptr) { + out_tensor = padded_out.get(); + } + + out_tensor->Clear(); + + Tensor::MappingGuard 
input_mapper(input); + Tensor::MappingGuard filter_mapper(filter); + Tensor::MappingGuard output_mapper(output); + + auto input_data = input->data(); + auto filter_data = filter->data(); + auto padded_out_data = out_tensor->mutable_data(); + + auto &in_shape = input->shape(); + auto &out_shape = out_tensor->shape(); + + const index_t batch = in_shape[0]; + const index_t inch = in_shape[1]; + const index_t h = in_shape[2]; + const index_t w = in_shape[3]; + + const index_t outch = out_shape[1]; + const index_t outh = out_shape[2]; + const index_t outw = out_shape[3]; + + const index_t in_img_size = h * w; + const index_t out_img_size = outh * outw; + + const index_t inch_g = inch / group_; + const index_t outch_g = outch / group_; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute3D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1, + index_t start2, index_t end2, index_t step2) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t g = start1; g < end1; g += step1) { + for (index_t oc = start2; oc < end2; oc += step2) { + const index_t out_offset = b * outch + outch_g * g + oc; + float *out_base = padded_out_data + out_offset * out_img_size; + for (index_t ic = 0; ic < inch_g; ++ic) { + const index_t in_offset = b * inch + inch_g * g + ic; + const float *input_base = input_data + in_offset * in_img_size; + const index_t kernel_offset = (oc * group_ + g) * inch_g + ic; + const float *kernel_base = filter_data + kernel_offset * 9; + const float *in = input_base; + + const float *k0 = kernel_base; + const float *k1 = kernel_base + 3; + const float *k2 = kernel_base + 5; + + float32x4_t k0_vec = vld1q_f32(k0); + float32x4_t k1_vec = vld1q_f32(k1); + float32x4_t k2_vec = vld1q_f32(k2); + + for (index_t i = 0; i < h; ++i) { + float *out_row_base = out_base + i * 2 * outw; + float *out_row_0 = out_row_base; + float *out_row_1 = out_row_0 + outw; + float *out_row_2 = out_row_1 + outw; + + index_t j = 0; + + for (index_t n = 0; n + 9 < outw; n += 8) { + float32x4_t in_vec = vld1q_f32(in); + + // out row 0 + float32x4x2_t out00 = vld2q_f32(out_row_0); + out00.val[0] = + neon_vfma_lane_0(out00.val[0], in_vec, k0_vec); + out00.val[1] = + neon_vfma_lane_1(out00.val[1], in_vec, k0_vec); + vst2q_f32(out_row_0, out00); + + float32x4x2_t out01 = vld2q_f32(out_row_0 + 2); + out01.val[0] = + neon_vfma_lane_2(out01.val[0], in_vec, k0_vec); + vst2q_f32(out_row_0 + 2, out01); + + // out row 1 + float32x4x2_t out10 = vld2q_f32(out_row_1); + out10.val[0] = + neon_vfma_lane_0(out10.val[0], in_vec, k1_vec); + out10.val[1] = + neon_vfma_lane_1(out10.val[1], in_vec, k1_vec); + vst2q_f32(out_row_1, out10); + + float32x4x2_t out11 = vld2q_f32(out_row_1 + 2); + out11.val[0] = + neon_vfma_lane_2(out11.val[0], in_vec, k1_vec); + vst2q_f32(out_row_1 + 2, out11); + + // out row 2 + float32x4x2_t out20 = vld2q_f32(out_row_2); + out20.val[0] = + neon_vfma_lane_1(out20.val[0], in_vec, k2_vec); + out20.val[1] = + neon_vfma_lane_2(out20.val[1], in_vec, k2_vec); + vst2q_f32(out_row_2, out20); + + float32x4x2_t out21 = vld2q_f32(out_row_2 + 2); + out21.val[0] = + neon_vfma_lane_3(out21.val[0], in_vec, k2_vec); + vst2q_f32(out_row_2 + 2, out21); + + in += 4; + out_row_0 += 8; + out_row_1 += 8; + out_row_2 += 8; + j += 4; + } + + for (; j < w; ++j) { + float val = in[0]; + + for (int k = 0; k < 3; ++k) { + out_row_0[k] += val * k0[k]; + out_row_1[k] += val * k1[k]; + out_row_2[k] += val * k2[k + 1]; + } + + in++; + 
out_row_0 += 2; + out_row_1 += 2; + out_row_2 += 2; + } + } + } + } + } + } + }, 0, batch, 1, 0, group_, 1, 0, outch_g, 1); + + UnPadOutput(*out_tensor, out_pad_size, output); + + return MaceStatus::MACE_SUCCESS; +} + +} // namespace fp32 +} // namespace arm +} // namespace ops +} // namespace mace diff --git a/mace/ops/arm/fp32/depthwise_deconv_2d_3x3.h b/mace/ops/arm/fp32/depthwise_deconv_2d_3x3.h new file mode 100644 index 0000000000000000000000000000000000000000..5dd315a47ad5e0c9a815b64ca3c5c0de63faf25e --- /dev/null +++ b/mace/ops/arm/fp32/depthwise_deconv_2d_3x3.h @@ -0,0 +1,122 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_3X3_H_ +#define MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_3X3_H_ + +#include +#include + +#include "mace/public/mace.h" +#include "mace/core/tensor.h" +#include "mace/core/types.h" +#include "mace/core/op_context.h" +#include "mace/ops/arm/fp32/deconv_2d.h" +#include "mace/ops/common/conv_pool_2d_util.h" + +namespace mace { +namespace ops { +namespace arm { +namespace fp32 { + +class DepthwiseDeconv2dK3x3S1 : public Deconv2dBase { + public: + DepthwiseDeconv2dK3x3S1(const std::vector &paddings, + const Padding padding_type, + const FrameworkType framework_type) + : Deconv2dBase({1, 1}, + {1, 1}, + paddings, + padding_type, + framework_type) {} + virtual ~DepthwiseDeconv2dK3x3S1() {} + + MaceStatus Compute( + const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) override; +}; + +class DepthwiseDeconv2dK3x3S2 : public Deconv2dBase { + public: + DepthwiseDeconv2dK3x3S2(const std::vector &paddings, + const Padding padding_type, + const FrameworkType framework_type) + : Deconv2dBase({2, 2}, + {1, 1}, + paddings, + padding_type, + framework_type) {} + virtual ~DepthwiseDeconv2dK3x3S2() {} + + MaceStatus Compute( + const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) override; +}; + +class GroupDeconv2dK3x3S1 : public Deconv2dBase { + public: + GroupDeconv2dK3x3S1(const std::vector &paddings, + const Padding padding_type, + const int group, + const FrameworkType framework_type) + : Deconv2dBase({1, 1}, + {1, 1}, + paddings, + padding_type, + group, + framework_type) {} + virtual ~GroupDeconv2dK3x3S1() {} + + MaceStatus Compute( + const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) override; +}; + +class GroupDeconv2dK3x3S2 : public Deconv2dBase { + public: + GroupDeconv2dK3x3S2(const std::vector &paddings, + const Padding padding_type, + const int group, + const FrameworkType framework_type) + : Deconv2dBase({2, 2}, + {1, 1}, + paddings, + padding_type, + group, + framework_type) {} + virtual ~GroupDeconv2dK3x3S2() {} + + MaceStatus Compute( + const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor 
*output) override; +}; + +} // namespace fp32 +} // namespace arm +} // namespace ops +} // namespace mace + +#endif // MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_3X3_H_ diff --git a/mace/ops/arm/fp32/depthwise_deconv_2d_4x4.cc b/mace/ops/arm/fp32/depthwise_deconv_2d_4x4.cc new file mode 100644 index 0000000000000000000000000000000000000000..85c93b0cef7b53dc170d48eeaa6c65154f85c8e8 --- /dev/null +++ b/mace/ops/arm/fp32/depthwise_deconv_2d_4x4.cc @@ -0,0 +1,966 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/arm/fp32/depthwise_deconv_2d_4x4.h" + +#include +#include "mace/ops/arm/fp32/common_neon.h" + +namespace mace { +namespace ops { +namespace arm { +namespace fp32 { + +MaceStatus DepthwiseDeconv2dK4x4S1::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) { + std::unique_ptr padded_out; + std::vector out_pad_size; + group_ = input->dim(1); + ResizeOutAndPadOut(context, + input, + filter, + output_shape, + output, + &out_pad_size, + &padded_out); + + Tensor *out_tensor = output; + if (padded_out != nullptr) { + out_tensor = padded_out.get(); + } + + out_tensor->Clear(); + + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard filter_mapper(filter); + Tensor::MappingGuard output_mapper(output); + + auto input_data = input->data(); + auto filter_data = filter->data(); + auto padded_out_data = out_tensor->mutable_data(); + + auto &in_shape = input->shape(); + auto &out_shape = out_tensor->shape(); + + const index_t batch = in_shape[0]; + const index_t channels = in_shape[1]; + const index_t h = in_shape[2]; + const index_t w = in_shape[3]; + const index_t in_img_size = h * w; + const index_t outh = out_shape[2]; + const index_t outw = out_shape[3]; + const index_t out_img_size = outh * outw; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { + const index_t offset = b * channels + c; + float *out_base = padded_out_data + offset * out_img_size; + const float *input_base = input_data + offset * in_img_size; + const float *kernel_base = filter_data + c * 16; + const float *in = input_base; + const float *k0 = kernel_base; + const float *k1 = kernel_base + 4; + const float *k2 = kernel_base + 8; + const float *k3 = kernel_base + 12; + + float32x4_t k0_vec = vld1q_f32(k0); + float32x4_t k1_vec = vld1q_f32(k1); + float32x4_t k2_vec = vld1q_f32(k2); + float32x4_t k3_vec = vld1q_f32(k3); + + for (index_t i = 0; i < h; i++) { + float *out_row = out_base + i * outw; + float *out_row_0 = out_row; + float *out_row_1 = out_row_0 + outw; + float *out_row_2 = out_row_1 + outw; + float *out_row_3 = out_row_2 + outw; + index_t j = 0; + + for (; j + 3 < w; j += 4) { + float32x4_t 
in_vec = vld1q_f32(in); + + float32x4_t out00 = vld1q_f32(out_row_0); + out00 = neon_vfma_lane_0(out00, in_vec, k0_vec); + vst1q_f32(out_row_0, out00); + + float32x4_t out01 = vld1q_f32(out_row_0 + 1); + out01 = neon_vfma_lane_1(out01, in_vec, k0_vec); + vst1q_f32(out_row_0 + 1, out01); + + float32x4_t out02 = vld1q_f32(out_row_0 + 2); + out02 = neon_vfma_lane_2(out02, in_vec, k0_vec); + vst1q_f32(out_row_0 + 2, out02); + + float32x4_t out03 = vld1q_f32(out_row_0 + 3); + out03 = neon_vfma_lane_3(out03, in_vec, k0_vec); + vst1q_f32(out_row_0 + 3, out03); + + // + float32x4_t out10 = vld1q_f32(out_row_1); + out10 = neon_vfma_lane_0(out10, in_vec, k1_vec); + vst1q_f32(out_row_1, out10); + + float32x4_t out11 = vld1q_f32(out_row_1 + 1); + out11 = neon_vfma_lane_1(out11, in_vec, k1_vec); + vst1q_f32(out_row_1 + 1, out11); + + float32x4_t out12 = vld1q_f32(out_row_1 + 2); + out12 = neon_vfma_lane_2(out12, in_vec, k1_vec); + vst1q_f32(out_row_1 + 2, out12); + + float32x4_t out13 = vld1q_f32(out_row_1 + 3); + out13 = neon_vfma_lane_3(out13, in_vec, k1_vec); + vst1q_f32(out_row_1 + 3, out13); + + // + float32x4_t out20 = vld1q_f32(out_row_2 + 0); + out20 = neon_vfma_lane_0(out20, in_vec, k2_vec); + vst1q_f32(out_row_2 + 0, out20); + + float32x4_t out21 = vld1q_f32(out_row_2 + 1); + out21 = neon_vfma_lane_1(out21, in_vec, k2_vec); + vst1q_f32(out_row_2 + 1, out21); + + float32x4_t out22 = vld1q_f32(out_row_2 + 2); + out22 = neon_vfma_lane_2(out22, in_vec, k2_vec); + vst1q_f32(out_row_2 + 2, out22); + + float32x4_t out23 = vld1q_f32(out_row_2 + 3); + out23 = neon_vfma_lane_3(out23, in_vec, k2_vec); + vst1q_f32(out_row_2 + 3, out23); + + // + float32x4_t out30 = vld1q_f32(out_row_3 + 0); + out30 = neon_vfma_lane_0(out30, in_vec, k3_vec); + vst1q_f32(out_row_3 + 0, out30); + + float32x4_t out31 = vld1q_f32(out_row_3 + 1); + out31 = neon_vfma_lane_1(out31, in_vec, k3_vec); + vst1q_f32(out_row_3 + 1, out31); + + float32x4_t out32 = vld1q_f32(out_row_3 + 2); + out32 = neon_vfma_lane_2(out32, in_vec, k3_vec); + vst1q_f32(out_row_3 + 2, out32); + + float32x4_t out33 = vld1q_f32(out_row_3 + 3); + out33 = neon_vfma_lane_3(out33, in_vec, k3_vec); + vst1q_f32(out_row_3 + 3, out33); + + in += 4; + out_row_0 += 4; + out_row_1 += 4; + out_row_2 += 4; + out_row_3 += 4; + } + + for (; j < w; j++) { + float val = in[0]; + for (int k = 0; k < 4; ++k) { + out_row_0[k] += val * k0[k]; + out_row_1[k] += val * k1[k]; + out_row_2[k] += val * k2[k]; + out_row_3[k] += val * k3[k]; + } + in++; + out_row_0++; + out_row_1++; + out_row_2++; + out_row_3++; + } + } + } + } + }, 0, batch, 1, 0, channels, 1); + + UnPadOutput(*out_tensor, out_pad_size, output); + + return MaceStatus::MACE_SUCCESS; +} + +MaceStatus DepthwiseDeconv2dK4x4S2::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) { + std::unique_ptr padded_out; + std::vector out_pad_size; + group_ = input->dim(1); + ResizeOutAndPadOut(context, + input, + filter, + output_shape, + output, + &out_pad_size, + &padded_out); + + Tensor *out_tensor = output; + if (padded_out != nullptr) { + out_tensor = padded_out.get(); + } + + out_tensor->Clear(); + + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard filter_mapper(filter); + Tensor::MappingGuard output_mapper(output); + + auto input_data = input->data(); + auto filter_data = filter->data(); + auto padded_out_data = out_tensor->mutable_data(); + + auto &in_shape = input->shape(); + auto &out_shape = out_tensor->shape(); + + const index_t 
batch = in_shape[0]; + const index_t channels = in_shape[1]; + const index_t h = in_shape[2]; + const index_t w = in_shape[3]; + const index_t in_img_size = h * w; + const index_t outh = out_shape[2]; + const index_t outw = out_shape[3]; + const index_t out_img_size = outh * outw; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { + const index_t offset = b * channels + c; + float *out_base = padded_out_data + offset * out_img_size; + const float *input_base = input_data + offset * in_img_size; + const float *kernel_base = filter_data + c * 16; + const float *in = input_base; + + const float *k0 = kernel_base; + const float *k1 = kernel_base + 4; + const float *k2 = kernel_base + 8; + const float *k3 = kernel_base + 12; + + float32x4_t k0_vec = vld1q_f32(k0); + float32x4_t k1_vec = vld1q_f32(k1); + float32x4_t k2_vec = vld1q_f32(k2); + float32x4_t k3_vec = vld1q_f32(k3); + + for (index_t i = 0; i < h; i++) { + float *out_row = out_base + 2 * i * outw; + + float *out_row_0 = out_row; + float *out_row_1 = out_row_0 + outw; + float *out_row_2 = out_row_1 + outw; + float *out_row_3 = out_row_2 + outw; + + index_t j = 0; + + for (index_t n = 0; n + 9 < outw; n += 8) { + float32x4_t in_vec = vld1q_f32(in); + + // row 0 + float32x4x2_t out0 = vld2q_f32(out_row_0); + out0.val[0] = + neon_vfma_lane_0(out0.val[0], in_vec, k0_vec); + out0.val[1] = + neon_vfma_lane_1(out0.val[1], in_vec, k0_vec); + vst2q_f32(out_row_0, out0); + out0 = vld2q_f32(out_row_0 + 2); + out0.val[0] = + neon_vfma_lane_2(out0.val[0], in_vec, k0_vec); + out0.val[1] = + neon_vfma_lane_3(out0.val[1], in_vec, k0_vec); + vst2q_f32(out_row_0 + 2, out0); + + // row 1 + float32x4x2_t out1 = vld2q_f32(out_row_1); + out1.val[0] = + neon_vfma_lane_0(out1.val[0], in_vec, k1_vec); + out1.val[1] = + neon_vfma_lane_1(out1.val[1], in_vec, k1_vec); + vst2q_f32(out_row_1, out1); + out1 = vld2q_f32(out_row_1 + 2); + out1.val[0] = + neon_vfma_lane_2(out1.val[0], in_vec, k1_vec); + out1.val[1] = + neon_vfma_lane_3(out1.val[1], in_vec, k1_vec); + vst2q_f32(out_row_1 + 2, out1); + + // row 2 + float32x4x2_t out2 = vld2q_f32(out_row_2); + out2.val[0] = + neon_vfma_lane_0(out2.val[0], in_vec, k2_vec); + out2.val[1] = + neon_vfma_lane_1(out2.val[1], in_vec, k2_vec); + vst2q_f32(out_row_2, out2); + out2 = vld2q_f32(out_row_2 + 2); + out2.val[0] = + neon_vfma_lane_2(out2.val[0], in_vec, k2_vec); + out2.val[1] = + neon_vfma_lane_3(out2.val[1], in_vec, k2_vec); + vst2q_f32(out_row_2 + 2, out2); + + // row 3 + float32x4x2_t out3 = vld2q_f32(out_row_3); + out3.val[0] = + neon_vfma_lane_0(out3.val[0], in_vec, k3_vec); + out3.val[1] = + neon_vfma_lane_1(out3.val[1], in_vec, k3_vec); + vst2q_f32(out_row_3, out3); + out3 = vld2q_f32(out_row_3 + 2); + out3.val[0] = + neon_vfma_lane_2(out3.val[0], in_vec, k3_vec); + out3.val[1] = + neon_vfma_lane_3(out3.val[1], in_vec, k3_vec); + vst2q_f32(out_row_3 + 2, out3); + + in += 4; + out_row_0 += 8; + out_row_1 += 8; + out_row_2 += 8; + out_row_3 += 8; + j += 4; + } + + for (; j < w; j++) { + float val = in[0]; + for (int k = 0; k < 4; ++k) { + out_row_0[k] += val * k0[k]; + out_row_1[k] += val * k1[k]; + out_row_2[k] += val * k2[k]; + out_row_3[k] += val * k3[k]; + } + in++; + out_row_0 += 2; + out_row_1 += 2; + out_row_2 += 2; + out_row_3 += 2; + } + } + } + } + 
}, 0, batch, 1, 0, channels, 1); + + UnPadOutput(*out_tensor, out_pad_size, output); + + return MaceStatus::MACE_SUCCESS; +} + +MaceStatus GroupDeconv2dK4x4S1::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) { + std::unique_ptr padded_out; + std::vector out_pad_size; + ResizeOutAndPadOut(context, + input, + filter, + output_shape, + output, + &out_pad_size, + &padded_out); + + Tensor *out_tensor = output; + if (padded_out != nullptr) { + out_tensor = padded_out.get(); + } + + out_tensor->Clear(); + + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard filter_mapper(filter); + Tensor::MappingGuard output_mapper(output); + + auto input_data = input->data(); + auto filter_data = filter->data(); + auto padded_out_data = out_tensor->mutable_data(); + + auto &in_shape = input->shape(); + auto &out_shape = out_tensor->shape(); + + const index_t batch = in_shape[0]; + const index_t inch = in_shape[1]; + const index_t h = in_shape[2]; + const index_t w = in_shape[3]; + + const index_t outch = out_shape[1]; + const index_t outh = out_shape[2]; + const index_t outw = out_shape[3]; + + const index_t in_img_size = h * w; + const index_t out_img_size = outh * outw; + + const index_t inch_g = inch / group_; + const index_t outch_g = outch / group_; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute3D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1, + index_t start2, index_t end2, index_t step2) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t g = start1; g < end1; g += step1) { + for (index_t oc = start2; oc < end2; oc += step2) { + if (oc + 1 < outch_g) { + const index_t out_offset = + (b * outch + outch_g * g + oc) * out_img_size; + float *out_base = padded_out_data + out_offset; + float *out_base1 = out_base + out_img_size; + for (index_t ic = 0; ic < inch_g; ic++) { + const index_t in_offset = + (b * inch + inch_g * g + ic) * in_img_size; + const float *input_base = input_data + in_offset; + const float *in = input_base; + const index_t kernel_offset = + ((oc * group_ + g) * inch_g + ic) * 16; + const float *kernel_base = filter_data + kernel_offset; + const float *k0 = kernel_base; + const float *k1 = kernel_base + 4; + const float *k2 = kernel_base + 8; + const float *k3 = kernel_base + 12; + + const float *kernel_base1 = kernel_base + inch * 16; + const float *k10 = kernel_base1; + const float *k11 = kernel_base1 + 4; + const float *k12 = kernel_base1 + 8; + const float *k13 = kernel_base1 + 12; + + float32x4_t k0_vec = vld1q_f32(k0); + float32x4_t k1_vec = vld1q_f32(k1); + float32x4_t k2_vec = vld1q_f32(k2); + float32x4_t k3_vec = vld1q_f32(k3); + + float32x4_t k10_vec = vld1q_f32(k10); + float32x4_t k11_vec = vld1q_f32(k11); + float32x4_t k12_vec = vld1q_f32(k12); + float32x4_t k13_vec = vld1q_f32(k13); + + for (index_t i = 0; i < h; i++) { + float *out_row = out_base + i * outw; + + float *out_row_0 = out_row; + float *out_row_1 = out_row_0 + outw; + float *out_row_2 = out_row_1 + outw; + float *out_row_3 = out_row_2 + outw; + + float *out_row1 = out_base1 + i * outw; + + float *out_row1_0 = out_row1; + float *out_row1_1 = out_row1_0 + outw; + float *out_row1_2 = out_row1_1 + outw; + float *out_row1_3 = out_row1_2 + outw; + + index_t j = 0; + + for (; j + 3 < w; j += 4) { + float32x4_t in_vec = vld1q_f32(in); + float32x4_t out00, out01, out02, out03; + float32x4_t out10, out11, 
out12, out13; + + out00 = vld1q_f32(out_row_0); + out00 = neon_vfma_lane_0(out00, in_vec, k0_vec); + vst1q_f32(out_row_0, out00); + + out10 = vld1q_f32(out_row1_0); + out10 = neon_vfma_lane_0(out10, in_vec, k10_vec); + vst1q_f32(out_row1_0, out10); + + out01 = vld1q_f32(out_row_0 + 1); + out01 = neon_vfma_lane_1(out01, in_vec, k0_vec); + vst1q_f32(out_row_0 + 1, out01); + + out11 = vld1q_f32(out_row1_0 + 1); + out11 = neon_vfma_lane_1(out11, in_vec, k10_vec); + vst1q_f32(out_row1_0 + 1, out11); + + out02 = vld1q_f32(out_row_0 + 2); + out02 = neon_vfma_lane_2(out02, in_vec, k0_vec); + vst1q_f32(out_row_0 + 2, out02); + + out12 = vld1q_f32(out_row1_0 + 2); + out12 = neon_vfma_lane_2(out12, in_vec, k10_vec); + vst1q_f32(out_row1_0 + 2, out12); + + out03 = vld1q_f32(out_row_0 + 3); + out03 = neon_vfma_lane_3(out03, in_vec, k0_vec); + vst1q_f32(out_row_0 + 3, out03); + + out13 = vld1q_f32(out_row1_0 + 3); + out13 = neon_vfma_lane_3(out13, in_vec, k10_vec); + vst1q_f32(out_row1_0 + 3, out13); + + // + out00 = vld1q_f32(out_row_1); + out00 = neon_vfma_lane_0(out00, in_vec, k1_vec); + vst1q_f32(out_row_1, out00); + + out10 = vld1q_f32(out_row1_1); + out10 = neon_vfma_lane_0(out10, in_vec, k11_vec); + vst1q_f32(out_row1_1, out10); + + out01 = vld1q_f32(out_row_1 + 1); + out01 = neon_vfma_lane_1(out01, in_vec, k1_vec); + vst1q_f32(out_row_1 + 1, out01); + + out11 = vld1q_f32(out_row1_1 + 1); + out11 = neon_vfma_lane_1(out11, in_vec, k11_vec); + vst1q_f32(out_row1_1 + 1, out11); + + out02 = vld1q_f32(out_row_1 + 2); + out02 = neon_vfma_lane_2(out02, in_vec, k1_vec); + vst1q_f32(out_row_1 + 2, out02); + + out12 = vld1q_f32(out_row1_1 + 2); + out12 = neon_vfma_lane_2(out12, in_vec, k11_vec); + vst1q_f32(out_row1_1 + 2, out12); + + out03 = vld1q_f32(out_row_1 + 3); + out03 = neon_vfma_lane_3(out03, in_vec, k1_vec); + vst1q_f32(out_row_1 + 3, out03); + + out13 = vld1q_f32(out_row1_1 + 3); + out13 = neon_vfma_lane_3(out13, in_vec, k11_vec); + vst1q_f32(out_row1_1 + 3, out13); + + // + out00 = vld1q_f32(out_row_2 + 0); + out00 = neon_vfma_lane_0(out00, in_vec, k2_vec); + vst1q_f32(out_row_2 + 0, out00); + + out10 = vld1q_f32(out_row1_2 + 0); + out10 = neon_vfma_lane_0(out10, in_vec, k12_vec); + vst1q_f32(out_row1_2 + 0, out10); + + out01 = vld1q_f32(out_row_2 + 1); + out01 = neon_vfma_lane_1(out01, in_vec, k2_vec); + vst1q_f32(out_row_2 + 1, out01); + + out11 = vld1q_f32(out_row1_2 + 1); + out11 = neon_vfma_lane_1(out11, in_vec, k12_vec); + vst1q_f32(out_row1_2 + 1, out11); + + out02 = vld1q_f32(out_row_2 + 2); + out02 = neon_vfma_lane_2(out02, in_vec, k2_vec); + vst1q_f32(out_row_2 + 2, out02); + + out12 = vld1q_f32(out_row1_2 + 2); + out12 = neon_vfma_lane_2(out12, in_vec, k12_vec); + vst1q_f32(out_row1_2 + 2, out12); + + out03 = vld1q_f32(out_row_2 + 3); + out03 = neon_vfma_lane_3(out03, in_vec, k2_vec); + vst1q_f32(out_row_2 + 3, out03); + + out13 = vld1q_f32(out_row1_2 + 3); + out13 = neon_vfma_lane_3(out13, in_vec, k12_vec); + vst1q_f32(out_row1_2 + 3, out13); + + // + out00 = vld1q_f32(out_row_3 + 0); + out00 = neon_vfma_lane_0(out00, in_vec, k3_vec); + vst1q_f32(out_row_3 + 0, out00); + + out10 = vld1q_f32(out_row1_3 + 0); + out10 = neon_vfma_lane_0(out10, in_vec, k13_vec); + vst1q_f32(out_row1_3 + 0, out10); + + out01 = vld1q_f32(out_row_3 + 1); + out01 = neon_vfma_lane_1(out01, in_vec, k3_vec); + vst1q_f32(out_row_3 + 1, out01); + + out11 = vld1q_f32(out_row1_3 + 1); + out11 = neon_vfma_lane_1(out11, in_vec, k13_vec); + vst1q_f32(out_row1_3 + 1, out11); + + out02 = vld1q_f32(out_row_3 + 2); + 
out02 = neon_vfma_lane_2(out02, in_vec, k3_vec); + vst1q_f32(out_row_3 + 2, out02); + + out12 = vld1q_f32(out_row1_3 + 2); + out12 = neon_vfma_lane_2(out12, in_vec, k13_vec); + vst1q_f32(out_row1_3 + 2, out12); + + out03 = vld1q_f32(out_row_3 + 3); + out03 = neon_vfma_lane_3(out03, in_vec, k3_vec); + vst1q_f32(out_row_3 + 3, out03); + + out13 = vld1q_f32(out_row1_3 + 3); + out13 = neon_vfma_lane_3(out13, in_vec, k13_vec); + vst1q_f32(out_row1_3 + 3, out13); + + in += 4; + out_row_0 += 4; + out_row_1 += 4; + out_row_2 += 4; + out_row_3 += 4; + out_row1_0 += 4; + out_row1_1 += 4; + out_row1_2 += 4; + out_row1_3 += 4; + } + + for (; j < w; j++) { + float val = in[0]; + for (int k = 0; k < 4; ++k) { + out_row_0[k] += val * k0[k]; + out_row_1[k] += val * k1[k]; + out_row_2[k] += val * k2[k]; + out_row_3[k] += val * k3[k]; + out_row1_0[k] += val * k10[k]; + out_row1_1[k] += val * k11[k]; + out_row1_2[k] += val * k12[k]; + out_row1_3[k] += val * k13[k]; + } + in++; + out_row_0++; + out_row_1++; + out_row_2++; + out_row_3++; + out_row1_0++; + out_row1_1++; + out_row1_2++; + out_row1_3++; + } + } + } + } else { + const index_t out_offset = + (b * outch + outch_g * g + oc) * out_img_size; + float *out_base = padded_out_data + out_offset; + for (index_t ic = 0; ic < inch_g; ++ic) { + const index_t in_offset = + (b * inch + inch_g * g + ic) * in_img_size; + const index_t kernel_offset = + ((oc * group_ + g) * inch_g + ic) * 16; + + const float *input_base = input_data + in_offset; + const float *kernel_base = filter_data + kernel_offset; + const float *in = input_base; + const float *k0 = kernel_base; + const float *k1 = kernel_base + 4; + const float *k2 = kernel_base + 8; + const float *k3 = kernel_base + 12; + + float32x4_t k0_vec = vld1q_f32(k0); + float32x4_t k1_vec = vld1q_f32(k1); + float32x4_t k2_vec = vld1q_f32(k2); + float32x4_t k3_vec = vld1q_f32(k3); + + for (index_t i = 0; i < h; i++) { + float *out_row = out_base + i * outw; + float *out_row_0 = out_row; + float *out_row_1 = out_row_0 + outw; + float *out_row_2 = out_row_1 + outw; + float *out_row_3 = out_row_2 + outw; + index_t j = 0; + + for (; j + 3 < w; j += 4) { + float32x4_t in_vec = vld1q_f32(in); + + float32x4_t out00 = vld1q_f32(out_row_0); + out00 = neon_vfma_lane_0(out00, in_vec, k0_vec); + vst1q_f32(out_row_0, out00); + + float32x4_t out01 = vld1q_f32(out_row_0 + 1); + out01 = neon_vfma_lane_1(out01, in_vec, k0_vec); + vst1q_f32(out_row_0 + 1, out01); + + float32x4_t out02 = vld1q_f32(out_row_0 + 2); + out02 = neon_vfma_lane_2(out02, in_vec, k0_vec); + vst1q_f32(out_row_0 + 2, out02); + + float32x4_t out03 = vld1q_f32(out_row_0 + 3); + out03 = neon_vfma_lane_3(out03, in_vec, k0_vec); + vst1q_f32(out_row_0 + 3, out03); + + // + float32x4_t out10 = vld1q_f32(out_row_1); + out10 = neon_vfma_lane_0(out10, in_vec, k1_vec); + vst1q_f32(out_row_1, out10); + + float32x4_t out11 = vld1q_f32(out_row_1 + 1); + out11 = neon_vfma_lane_1(out11, in_vec, k1_vec); + vst1q_f32(out_row_1 + 1, out11); + + float32x4_t out12 = vld1q_f32(out_row_1 + 2); + out12 = neon_vfma_lane_2(out12, in_vec, k1_vec); + vst1q_f32(out_row_1 + 2, out12); + + float32x4_t out13 = vld1q_f32(out_row_1 + 3); + out13 = neon_vfma_lane_3(out13, in_vec, k1_vec); + vst1q_f32(out_row_1 + 3, out13); + + // + float32x4_t out20 = vld1q_f32(out_row_2 + 0); + out20 = neon_vfma_lane_0(out20, in_vec, k2_vec); + vst1q_f32(out_row_2 + 0, out20); + + float32x4_t out21 = vld1q_f32(out_row_2 + 1); + out21 = neon_vfma_lane_1(out21, in_vec, k2_vec); + vst1q_f32(out_row_2 + 1, out21); + + 
float32x4_t out22 = vld1q_f32(out_row_2 + 2); + out22 = neon_vfma_lane_2(out22, in_vec, k2_vec); + vst1q_f32(out_row_2 + 2, out22); + + float32x4_t out23 = vld1q_f32(out_row_2 + 3); + out23 = neon_vfma_lane_3(out23, in_vec, k2_vec); + vst1q_f32(out_row_2 + 3, out23); + + // + float32x4_t out30 = vld1q_f32(out_row_3 + 0); + out30 = neon_vfma_lane_0(out30, in_vec, k3_vec); + vst1q_f32(out_row_3 + 0, out30); + + float32x4_t out31 = vld1q_f32(out_row_3 + 1); + out31 = neon_vfma_lane_1(out31, in_vec, k3_vec); + vst1q_f32(out_row_3 + 1, out31); + + float32x4_t out32 = vld1q_f32(out_row_3 + 2); + out32 = neon_vfma_lane_2(out32, in_vec, k3_vec); + vst1q_f32(out_row_3 + 2, out32); + + float32x4_t out33 = vld1q_f32(out_row_3 + 3); + out33 = neon_vfma_lane_3(out33, in_vec, k3_vec); + vst1q_f32(out_row_3 + 3, out33); + + in += 4; + out_row_0 += 4; + out_row_1 += 4; + out_row_2 += 4; + out_row_3 += 4; + } + + for (; j < w; j++) { + float val = in[0]; + for (int k = 0; k < 4; ++k) { + out_row_0[k] += val * k0[k]; + out_row_1[k] += val * k1[k]; + out_row_2[k] += val * k2[k]; + out_row_3[k] += val * k3[k]; + } + in++; + out_row_0++; + out_row_1++; + out_row_2++; + out_row_3++; + } + } + } + } + } + } + } + }, 0, batch, 1, 0, group_, 1, 0, outch_g, 2); + + UnPadOutput(*out_tensor, out_pad_size, output); + + return MaceStatus::MACE_SUCCESS; +} + +MaceStatus GroupDeconv2dK4x4S2::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) { + std::unique_ptr padded_out; + std::vector out_pad_size; + ResizeOutAndPadOut(context, + input, + filter, + output_shape, + output, + &out_pad_size, + &padded_out); + + Tensor *out_tensor = output; + if (padded_out != nullptr) { + out_tensor = padded_out.get(); + } + + out_tensor->Clear(); + + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard filter_mapper(filter); + Tensor::MappingGuard output_mapper(output); + + auto input_data = input->data(); + auto filter_data = filter->data(); + auto padded_out_data = out_tensor->mutable_data(); + + auto &in_shape = input->shape(); + auto &out_shape = out_tensor->shape(); + + const index_t batch = in_shape[0]; + const index_t inch = in_shape[1]; + const index_t h = in_shape[2]; + const index_t w = in_shape[3]; + + const index_t outch = out_shape[1]; + const index_t outh = out_shape[2]; + const index_t outw = out_shape[3]; + + const index_t in_img_size = h * w; + const index_t out_img_size = outh * outw; + + const index_t inch_g = inch / group_; + const index_t outch_g = outch / group_; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute3D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1, + index_t start2, index_t end2, index_t step2) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t g = start1; g < end1; g += step1) { + for (index_t oc = start2; oc < end2; oc += step2) { + const index_t out_offset = + (b * outch + outch_g * g + oc) * out_img_size; + float *out_base = padded_out_data + out_offset; + for (index_t ic = 0; ic < inch_g; ic++) { + const index_t in_offset = + (b * inch + inch_g * g + ic) * in_img_size; + const index_t kernel_offset = + ((oc * group_ + g) * inch_g + ic) * 16; + const float *input_base = input_data + in_offset; + const float *kernel_base = filter_data + kernel_offset; + const float *in = input_base; + + const float *k0 = kernel_base; + const float *k1 = kernel_base + 4; + const float *k2 = kernel_base + 
8; + const float *k3 = kernel_base + 12; + + float32x4_t k0_vec = vld1q_f32(k0); + float32x4_t k1_vec = vld1q_f32(k1); + float32x4_t k2_vec = vld1q_f32(k2); + float32x4_t k3_vec = vld1q_f32(k3); + + for (index_t i = 0; i < h; i++) { + float *out_row = out_base + 2 * i * outw; + + float *out_row_0 = out_row; + float *out_row_1 = out_row_0 + outw; + float *out_row_2 = out_row_1 + outw; + float *out_row_3 = out_row_2 + outw; + + index_t j = 0; + + for (index_t n = 0; n + 9 < outw; n += 8) { + float32x4_t in_vec = vld1q_f32(in); + + // row 0 + float32x4x2_t out0 = vld2q_f32(out_row_0); + out0.val[0] = + neon_vfma_lane_0(out0.val[0], in_vec, k0_vec); + out0.val[1] = + neon_vfma_lane_1(out0.val[1], in_vec, k0_vec); + vst2q_f32(out_row_0, out0); + out0 = vld2q_f32(out_row_0 + 2); + out0.val[0] = + neon_vfma_lane_2(out0.val[0], in_vec, k0_vec); + out0.val[1] = + neon_vfma_lane_3(out0.val[1], in_vec, k0_vec); + vst2q_f32(out_row_0 + 2, out0); + + // row 1 + float32x4x2_t out1 = vld2q_f32(out_row_1); + out1.val[0] = + neon_vfma_lane_0(out1.val[0], in_vec, k1_vec); + out1.val[1] = + neon_vfma_lane_1(out1.val[1], in_vec, k1_vec); + vst2q_f32(out_row_1, out1); + out1 = vld2q_f32(out_row_1 + 2); + out1.val[0] = + neon_vfma_lane_2(out1.val[0], in_vec, k1_vec); + out1.val[1] = + neon_vfma_lane_3(out1.val[1], in_vec, k1_vec); + vst2q_f32(out_row_1 + 2, out1); + + // row 2 + float32x4x2_t out2 = vld2q_f32(out_row_2); + out2.val[0] = + neon_vfma_lane_0(out2.val[0], in_vec, k2_vec); + out2.val[1] = + neon_vfma_lane_1(out2.val[1], in_vec, k2_vec); + vst2q_f32(out_row_2, out2); + out2 = vld2q_f32(out_row_2 + 2); + out2.val[0] = + neon_vfma_lane_2(out2.val[0], in_vec, k2_vec); + out2.val[1] = + neon_vfma_lane_3(out2.val[1], in_vec, k2_vec); + vst2q_f32(out_row_2 + 2, out2); + + // row 3 + float32x4x2_t out3 = vld2q_f32(out_row_3); + out3.val[0] = + neon_vfma_lane_0(out3.val[0], in_vec, k3_vec); + out3.val[1] = + neon_vfma_lane_1(out3.val[1], in_vec, k3_vec); + vst2q_f32(out_row_3, out3); + out3 = vld2q_f32(out_row_3 + 2); + out3.val[0] = + neon_vfma_lane_2(out3.val[0], in_vec, k3_vec); + out3.val[1] = + neon_vfma_lane_3(out3.val[1], in_vec, k3_vec); + vst2q_f32(out_row_3 + 2, out3); + + in += 4; + out_row_0 += 8; + out_row_1 += 8; + out_row_2 += 8; + out_row_3 += 8; + j += 4; + } + + for (; j < w; j++) { + float val = in[0]; + for (int k = 0; k < 4; ++k) { + out_row_0[k] += val * k0[k]; + out_row_1[k] += val * k1[k]; + out_row_2[k] += val * k2[k]; + out_row_3[k] += val * k3[k]; + } + in++; + out_row_0 += 2; + out_row_1 += 2; + out_row_2 += 2; + out_row_3 += 2; + } + } + } + } + } + } + }, 0, batch, 1, 0, group_, 1, 0, outch_g, 1); + + UnPadOutput(*out_tensor, out_pad_size, output); + + return MaceStatus::MACE_SUCCESS; +} + +} // namespace fp32 +} // namespace arm +} // namespace ops +} // namespace mace + diff --git a/mace/ops/arm/fp32/depthwise_deconv_2d_4x4.h b/mace/ops/arm/fp32/depthwise_deconv_2d_4x4.h new file mode 100644 index 0000000000000000000000000000000000000000..4b73ed010afdd783f45e39d638db01427070e717 --- /dev/null +++ b/mace/ops/arm/fp32/depthwise_deconv_2d_4x4.h @@ -0,0 +1,122 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_4X4_H_
+#define MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_4X4_H_
+
+#include <memory>
+#include <vector>
+
+#include "mace/public/mace.h"
+#include "mace/core/tensor.h"
+#include "mace/core/types.h"
+#include "mace/core/op_context.h"
+#include "mace/ops/arm/fp32/deconv_2d.h"
+#include "mace/ops/common/conv_pool_2d_util.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+namespace fp32 {
+
+class DepthwiseDeconv2dK4x4S1 : public Deconv2dBase {
+ public:
+  DepthwiseDeconv2dK4x4S1(const std::vector<int> &paddings,
+                          const Padding padding_type,
+                          const FrameworkType framework_type)
+      : Deconv2dBase({1, 1},
+                     {1, 1},
+                     paddings,
+                     padding_type,
+                     framework_type) {}
+  virtual ~DepthwiseDeconv2dK4x4S1() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      const Tensor *output_shape,
+      Tensor *output) override;
+};
+
+class DepthwiseDeconv2dK4x4S2 : public Deconv2dBase {
+ public:
+  DepthwiseDeconv2dK4x4S2(const std::vector<int> &paddings,
+                          const Padding padding_type,
+                          const FrameworkType framework_type)
+      : Deconv2dBase({2, 2},
+                     {1, 1},
+                     paddings,
+                     padding_type,
+                     framework_type) {}
+  virtual ~DepthwiseDeconv2dK4x4S2() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      const Tensor *output_shape,
+      Tensor *output) override;
+};
+
+class GroupDeconv2dK4x4S1 : public Deconv2dBase {
+ public:
+  GroupDeconv2dK4x4S1(const std::vector<int> &paddings,
+                      const Padding padding_type,
+                      const int group,
+                      const FrameworkType framework_type)
+      : Deconv2dBase({1, 1},
+                     {1, 1},
+                     paddings,
+                     padding_type,
+                     group,
+                     framework_type) {}
+  virtual ~GroupDeconv2dK4x4S1() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      const Tensor *output_shape,
+      Tensor *output) override;
+};
+
+class GroupDeconv2dK4x4S2 : public Deconv2dBase {
+ public:
+  GroupDeconv2dK4x4S2(const std::vector<int> &paddings,
+                      const Padding padding_type,
+                      const int group,
+                      const FrameworkType framework_type)
+      : Deconv2dBase({2, 2},
+                     {1, 1},
+                     paddings,
+                     padding_type,
+                     group,
+                     framework_type) {}
+  virtual ~GroupDeconv2dK4x4S2() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      const Tensor *output_shape,
+      Tensor *output) override;
+};
+
+}  // namespace fp32
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_4X4_H_
diff --git a/mace/ops/arm/fp32/depthwise_deconv_2d_general.cc b/mace/ops/arm/fp32/depthwise_deconv_2d_general.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a45d5acc6a663d370f1b741b5b15598c9fd40e22
--- /dev/null
+++ b/mace/ops/arm/fp32/depthwise_deconv_2d_general.cc
@@ -0,0 +1,213 @@
+// Copyright 2019 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/arm/fp32/depthwise_deconv_2d_general.h" + +namespace mace { +namespace ops { +namespace arm { +namespace fp32 { + +MaceStatus DepthwiseDeconv2dGeneral::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) { + std::unique_ptr padded_out; + std::vector out_pad_size; + group_ = input->dim(1); + ResizeOutAndPadOut(context, + input, + filter, + output_shape, + output, + &out_pad_size, + &padded_out); + + Tensor *out_tensor = output; + if (padded_out != nullptr) { + out_tensor = padded_out.get(); + } + + out_tensor->Clear(); + + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard filter_mapper(filter); + Tensor::MappingGuard output_mapper(output); + + auto input_data = input->data(); + auto filter_data = filter->data(); + auto padded_out_data = out_tensor->mutable_data(); + + auto &in_shape = input->shape(); + auto &out_shape = out_tensor->shape(); + + const index_t batch = in_shape[0]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t channels = in_shape[1]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + const index_t out_img_size = out_height * out_width; + const index_t in_img_size = in_height * in_width; + const index_t kernel_h = filter->dim(2); + const index_t kernel_w = filter->dim(3); + const int kernel_size = kernel_h * kernel_w; + + std::vector index_map(kernel_size, 0); + for (int i = 0; i < kernel_h; ++i) { + for (int j = 0; j < kernel_w; ++j) { + index_map[i * kernel_w + j] = i * out_width + j; + } + } + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { + float *out_base = + padded_out_data + (b * channels + c) * out_img_size; + for (index_t i = 0; i < in_height; ++i) { + for (index_t j = 0; j < in_width; ++j) { + const index_t out_offset = + i * strides_[0] * out_width + j * strides_[1]; + const index_t input_idx = + (b * channels + c) * in_img_size + i * in_width + j; + const float val = input_data[input_idx]; + const index_t kernel_offset = c * kernel_size; + for (int k = 0; k < kernel_size; ++k) { + const index_t out_idx = out_offset + index_map[k]; + const index_t kernel_idx = kernel_offset + k; + out_base[out_idx] += val * filter_data[kernel_idx]; + } + } + } + } + } + }, 0, batch, 1, 0, channels, 1); + + UnPadOutput(*out_tensor, out_pad_size, output); + + return MaceStatus::MACE_SUCCESS; +} + +MaceStatus GroupDeconv2dGeneral::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) { + std::unique_ptr padded_out; + std::vector out_pad_size; + ResizeOutAndPadOut(context, + input, + filter, + output_shape, + output, + &out_pad_size, + &padded_out); + + Tensor *out_tensor = output; + if (padded_out != nullptr) { + out_tensor = 
padded_out.get(); + } + + out_tensor->Clear(); + + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard filter_mapper(filter); + Tensor::MappingGuard output_mapper(output); + + auto input_data = input->data(); + auto filter_data = filter->data(); + auto padded_out_data = out_tensor->mutable_data(); + + auto &in_shape = input->shape(); + auto &out_shape = out_tensor->shape(); + + const index_t out_channels = out_shape[1]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + + const index_t batch = in_shape[0]; + const index_t in_channels = in_shape[1]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + + MACE_CHECK(in_channels % group_ == 0 && out_channels % group_ == 0, + "invalid input/output channel and group."); + + const index_t out_img_size = out_height * out_width; + const index_t in_img_size = in_height * in_width; + const index_t kernel_h = filter->dim(2); + const index_t kernel_w = filter->dim(3); + + const int kernel_size = kernel_h * kernel_w; + std::vector index_map(kernel_size, 0); + for (int i = 0; i < kernel_h; ++i) { + for (int j = 0; j < kernel_w; ++j) { + index_map[i * kernel_w + j] = i * out_width + j; + } + } + + const int in_channels_g = in_channels / group_; + const int out_channels_g = out_channels / group_; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute3D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1, + index_t start2, index_t end2, index_t step2) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t g = start1; g < end1; g += step1) { + for (index_t p = start2; p < end2; p += step2) { + const index_t out_base = + ((b * group_ + g) * out_channels_g + p) * out_img_size; + for (index_t i = 0; i < in_height; ++i) { + for (index_t j = 0; j < in_width; ++j) { + const index_t out_offset = + i * strides_[0] * out_width + j * strides_[1]; + for (int q = 0; q < in_channels_g; ++q) { + const index_t in_base = + ((b * group_ + g) * in_channels_g + q) * in_img_size; + const index_t in_offset = + in_base + i * in_width + j; + const float val = input_data[in_offset]; + const index_t k_offset = + ((p * group_ + g) * in_channels_g + q) * kernel_size; + for (int k = 0; k < kernel_size; ++k) { + const index_t out_idx = out_base + out_offset + index_map[k]; + const float w = filter_data[k_offset + k]; + padded_out_data[out_idx] += val * w; + } + } + } + } + } + } + } + }, 0, batch, 1, 0, group_, 1, 0, out_channels_g, 1); + + UnPadOutput(*out_tensor, out_pad_size, output); + + return MaceStatus::MACE_SUCCESS; +} + +} // namespace fp32 +} // namespace arm +} // namespace ops +} // namespace mace diff --git a/mace/ops/arm/fp32/depthwise_deconv_2d_general.h b/mace/ops/arm/fp32/depthwise_deconv_2d_general.h new file mode 100644 index 0000000000000000000000000000000000000000..d73480c5ea1a4fff7aa06656efb9a964acc1b01d --- /dev/null +++ b/mace/ops/arm/fp32/depthwise_deconv_2d_general.h @@ -0,0 +1,84 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_GENERAL_H_
+#define MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_GENERAL_H_
+
+#include <vector>
+#include <memory>
+
+#include "mace/public/mace.h"
+#include "mace/core/tensor.h"
+#include "mace/core/types.h"
+#include "mace/core/op_context.h"
+#include "mace/ops/arm/fp32/deconv_2d.h"
+#include "mace/ops/common/conv_pool_2d_util.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+namespace fp32 {
+
+class DepthwiseDeconv2dGeneral : public Deconv2dBase {
+ public:
+  DepthwiseDeconv2dGeneral(const std::vector<int> &strides,
+                           const std::vector<int> &dilations,
+                           const std::vector<int> &paddings,
+                           const Padding padding_type,
+                           const FrameworkType framework_type)
+      : Deconv2dBase(strides,
+                     dilations,
+                     paddings,
+                     padding_type,
+                     framework_type) {}
+  virtual ~DepthwiseDeconv2dGeneral() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      const Tensor *output_shape,
+      Tensor *output) override;
+};
+
+class GroupDeconv2dGeneral : public Deconv2dBase {
+ public:
+  GroupDeconv2dGeneral(const std::vector<int> &strides,
+                       const std::vector<int> &dilations,
+                       const std::vector<int> &paddings,
+                       const Padding padding_type,
+                       const int group,
+                       const FrameworkType framework_type)
+      : Deconv2dBase(strides,
+                     dilations,
+                     paddings,
+                     padding_type,
+                     group,
+                     framework_type) {}
+  virtual ~GroupDeconv2dGeneral() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      const Tensor *output_shape,
+      Tensor *output) override;
+};
+
+}  // namespace fp32
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_GENERAL_H_
diff --git a/mace/ops/arm/fp32/gemm.cc b/mace/ops/arm/fp32/gemm.cc
index 8acde2d17a81602b2c7c667a5aef52573eb31977..aacb6636adf2bb9efb75879db7ca545c9e3a4daf 100644
--- a/mace/ops/arm/fp32/gemm.cc
+++ b/mace/ops/arm/fp32/gemm.cc
@@ -39,8 +39,6 @@ MaceStatus Gemm::Compute(const OpContext *context,
                          const bool lhs_batched,
                          const bool rhs_batched,
                          Tensor *output) {
-  MACE_UNUSED(context);
-
   MACE_CHECK(output->size() == batch * rows * cols,
              "Need resize output tensor before call gemm.");
   Tensor::MappingGuard lhs_guard(lhs);
@@ -63,10 +61,8 @@ MaceStatus Gemm::Compute(const OpContext *context,
   const index_t cols_padded = RoundUp(cols, col_block_size);
   const index_t depth_padded = RoundUp(depth, depth_block_size);
 
-  ScratchBuffer *scratch = &tmp_scratch_buffer_;
-  if (context != nullptr && context->device()->scratch_buffer() != nullptr) {
-    scratch = context->device()->scratch_buffer();
-  }
+  ScratchBuffer *scratch = context->device()->scratch_buffer();
+
   index_t packed_lhs_size =
       PadAlignSize(sizeof(float) * rows_padded * depth_padded);
   index_t packed_rhs_size =
@@ -101,6 +97,9 @@ MaceStatus Gemm::Compute(const OpContext *context,
     }
   }
 
+  utils::ThreadPool
+      &thread_pool = context->device()->cpu_runtime()->thread_pool();
+
   for (index_t b = 0; b < batch; ++b) {
     MatrixMap<const float> lhs_matrix
@@ -119,17 +118,21 @@ MaceStatus Gemm::Compute(const OpContext *context,
 
     // pack lhs
     if (cached_ != kCacheLhs) {
-#pragma omp parallel for schedule(runtime)
- for (index_t row_block_idx = 0; row_block_idx < row_block_count; - ++row_block_idx) { - const index_t start_row = row_block_idx * row_block_size; - const index_t - row_block_len = std::min(row_block_size, rows - start_row); - float *packed_lhs_data_block = - packed_lhs_data + row_block_idx * row_block_size * depth_padded; - PackLhs(lhs_matrix.block(start_row, 0, row_block_len, depth), - packed_lhs_data_block); - } + thread_pool.Compute1D([=, &lhs_matrix](index_t start, + index_t end, + index_t step) { + for (index_t row_block_idx = start; row_block_idx < end; + row_block_idx += step) { + const index_t start_row = row_block_idx * row_block_size; + const index_t + row_block_len = std::min(row_block_size, rows - start_row); + float *packed_lhs_data_block = + packed_lhs_data + row_block_idx * row_block_size * depth_padded; + PackLhs(lhs_matrix.block(start_row, 0, row_block_len, depth), + packed_lhs_data_block); + } + }, 0, row_block_count, 1); + if (cache_side == kCacheLhs) { cached_ = kCacheLhs; if (lhs->UnderlyingBuffer()->OnHost()) { @@ -142,17 +145,21 @@ MaceStatus Gemm::Compute(const OpContext *context, // pack rhs if (cached_ != kCacheRhs) { -#pragma omp parallel for schedule(runtime) - for (index_t col_block_idx = 0; col_block_idx < col_block_count; - ++col_block_idx) { - const index_t start_col = col_block_idx * col_block_size; - const index_t - col_block_len = std::min(col_block_size, cols - start_col); - float *packed_rhs_data_block = - packed_rhs_data + col_block_idx * col_block_size * depth_padded; - PackRhs(rhs_matrix.block(0, start_col, depth, col_block_len), - packed_rhs_data_block); - } + thread_pool.Compute1D([=, &rhs_matrix](index_t start, + index_t end, + index_t step) { + for (index_t col_block_idx = start; col_block_idx < end; + col_block_idx += step) { + const index_t start_col = col_block_idx * col_block_size; + const index_t + col_block_len = std::min(col_block_size, cols - start_col); + float *packed_rhs_data_block = + packed_rhs_data + col_block_idx * col_block_size * depth_padded; + PackRhs(rhs_matrix.block(0, start_col, depth, col_block_len), + packed_rhs_data_block); + } + }, 0, col_block_count, 1); + if (cache_side == kCacheRhs) { cached_ = kCacheRhs; if (rhs->UnderlyingBuffer()->OnHost()) { @@ -164,35 +171,39 @@ MaceStatus Gemm::Compute(const OpContext *context, } // multiply lhs and rhs -#pragma omp parallel for schedule(runtime) - for (index_t row_block_idx = 0; row_block_idx < row_block_count; - ++row_block_idx) { - const index_t start_row = row_block_idx * row_block_size; - const index_t row_block_len = std::min(row_block_size, rows - start_row); - const float *packed_lhs_data_block = - packed_lhs_data + row_block_idx * row_block_size * depth_padded; - - for (index_t col_block_idx = 0; col_block_idx < col_block_count; - ++col_block_idx) { - const index_t start_col = col_block_idx * col_block_size; + thread_pool.Compute1D([=, &output_matrix](index_t start, + index_t end, + index_t step) { + for (index_t row_block_idx = start; row_block_idx < end; + row_block_idx += step) { + const index_t start_row = row_block_idx * row_block_size; const index_t - col_block_len = std::min(col_block_size, cols - start_col); - const float *packed_rhs_data_block = - packed_rhs_data + col_block_idx * col_block_size * depth_padded; - float *packed_output_data_block = - packed_output_data + row_block_idx * row_block_size * cols_padded - + col_block_idx * col_block_size; - ComputeBlock(packed_lhs_data_block, - packed_rhs_data_block, - depth_padded, - packed_output_data_block); - 
MatrixMap output_block = output_matrix.block(start_row, - start_col, - row_block_len, - col_block_len); - UnpackOutput(packed_output_data_block, &output_block); - } // col_block_idx - } // row_block_idx + row_block_len = std::min(row_block_size, rows - start_row); + const float *packed_lhs_data_block = + packed_lhs_data + row_block_idx * row_block_size * depth_padded; + + for (index_t col_block_idx = 0; col_block_idx < col_block_count; + ++col_block_idx) { + const index_t start_col = col_block_idx * col_block_size; + const index_t + col_block_len = std::min(col_block_size, cols - start_col); + const float *packed_rhs_data_block = + packed_rhs_data + col_block_idx * col_block_size * depth_padded; + float *packed_output_data_block = + packed_output_data + row_block_idx * row_block_size * cols_padded + + col_block_idx * col_block_size; + ComputeBlock(packed_lhs_data_block, + packed_rhs_data_block, + depth_padded, + packed_output_data_block); + MatrixMap output_block = output_matrix.block(start_row, + start_col, + row_block_len, + col_block_len); + UnpackOutput(packed_output_data_block, &output_block); + } // col_block_idx + } // row_block_idx + }, 0, row_block_count, 1); } // b return MaceStatus::MACE_SUCCESS; @@ -530,140 +541,140 @@ void Gemm::ComputeBlock(const float *packed_lhs_data, MACE_UNUSED(r_depth_block_count); asm volatile( - "mov r0, #0\n" - "vdup.f32 q8, r0 \n" - "vdup.f32 q9, r0 \n" - "vdup.f32 q10, r0 \n" - "vdup.f32 q11, r0 \n" - "vdup.f32 q12, r0 \n" - "vdup.f32 q13, r0 \n" - "vdup.f32 q14, r0 \n" - "vdup.f32 q15, r0 \n" - - // prelogue - "vld1.f32 {d0-d1}, [%[lhs_ptr]]! \n" - "vld1.f32 {d2-d3}, [%[lhs_ptr]]! \n" - "vld1.f32 {d4-d5}, [%[lhs_ptr]]! \n" - "vld1.f32 {d6-d7}, [%[lhs_ptr]]! \n" - - "vld1.f32 {d8-d9}, [%[rhs_ptr]]! \n" - "vld1.f32 {d10-d11}, [%[rhs_ptr]]! \n" - "vld1.f32 {d12-d13}, [%[rhs_ptr]]! \n" - "vld1.f32 {d14-d15}, [%[rhs_ptr]]! \n" - - "subs %[r_depth_block_count], %[r_depth_block_count], #1 \n" - "beq 1f\n" - - "0: \n" - - "vmla.f32 q8, q4, d0[0] \n" - "vmla.f32 q9, q5, d0[0] \n" - "vmla.f32 q10, q4, d0[1] \n" - "vmla.f32 q11, q5, d0[1] \n" - "vmla.f32 q12, q4, d1[0] \n" - "vmla.f32 q13, q5, d1[0] \n" - "vmla.f32 q14, q4, d1[1] \n" - "vmla.f32 q15, q5, d1[1] \n" - - "vld1.f32 {d0-d1}, [%[lhs_ptr]]! \n" - "vld1.f32 {d8-d9}, [%[rhs_ptr]]! \n" - "vld1.f32 {d10-d11}, [%[rhs_ptr]]! \n" - - "vmla.f32 q8, q6, d2[0] \n" - "vmla.f32 q9, q7, d2[0] \n" - "vmla.f32 q10, q6, d2[1] \n" - "vmla.f32 q11, q7, d2[1] \n" - "vmla.f32 q12, q6, d3[0] \n" - "vmla.f32 q13, q7, d3[0] \n" - "vmla.f32 q14, q6, d3[1] \n" - "vmla.f32 q15, q7, d3[1] \n" - - "vld1.f32 {d2-d3}, [%[lhs_ptr]]! \n" - "vld1.f32 {d12-d13}, [%[rhs_ptr]]! \n" - "vld1.f32 {d14-d15}, [%[rhs_ptr]]! \n" - - "vmla.f32 q8, q4, d4[0] \n" - "vmla.f32 q9, q5, d4[0] \n" - "vmla.f32 q10, q4, d4[1] \n" - "vmla.f32 q11, q5, d4[1] \n" - "vmla.f32 q12, q4, d5[0] \n" - "vmla.f32 q13, q5, d5[0] \n" - "vmla.f32 q14, q4, d5[1] \n" - "vmla.f32 q15, q5, d5[1] \n" - - "vld1.f32 {d4-d5}, [%[lhs_ptr]]! \n" - "vld1.f32 {d8-d9}, [%[rhs_ptr]]! \n" - "vld1.f32 {d10-d11}, [%[rhs_ptr]]! \n" - - "subs %[r_depth_block_count], %[r_depth_block_count], #1 \n" - - "vmla.f32 q8, q6, d6[0] \n" - "vmla.f32 q9, q7, d6[0] \n" - "vmla.f32 q10, q6, d6[1] \n" - "vmla.f32 q11, q7, d6[1] \n" - "vmla.f32 q12, q6, d7[0] \n" - "vmla.f32 q13, q7, d7[0] \n" - "vmla.f32 q14, q6, d7[1] \n" - "vmla.f32 q15, q7, d7[1] \n" - - "vld1.f32 {d6-d7}, [%[lhs_ptr]]! \n" - "vld1.f32 {d12-d13}, [%[rhs_ptr]]! \n" - "vld1.f32 {d14-d15}, [%[rhs_ptr]]! 
\n" - - "bne 0b \n" - - // prologue - "1:\n" - "vmla.f32 q8, q4, d0[0] \n" - "vmla.f32 q9, q5, d0[0] \n" - "vmla.f32 q10, q4, d0[1] \n" - "vmla.f32 q11, q5, d0[1] \n" - "vmla.f32 q12, q4, d1[0] \n" - "vmla.f32 q13, q5, d1[0] \n" - "vmla.f32 q14, q4, d1[1] \n" - "vmla.f32 q15, q5, d1[1] \n" - - "vld1.f32 {d8-d9}, [%[rhs_ptr]]! \n" - "vld1.f32 {d10-d11}, [%[rhs_ptr]]! \n" - - "vmla.f32 q8, q6, d2[0] \n" - "vmla.f32 q9, q7, d2[0] \n" - "vmla.f32 q10, q6, d2[1] \n" - "vmla.f32 q11, q7, d2[1] \n" - "vmla.f32 q12, q6, d3[0] \n" - "vmla.f32 q13, q7, d3[0] \n" - "vmla.f32 q14, q6, d3[1] \n" - "vmla.f32 q15, q7, d3[1] \n" - - "vld1.f32 {d12-d13}, [%[rhs_ptr]]! \n" - "vld1.f32 {d14-d15}, [%[rhs_ptr]]! \n" - - "vmla.f32 q8, q4, d4[0] \n" - "vmla.f32 q9, q5, d4[0] \n" - "vmla.f32 q10, q4, d4[1] \n" - "vmla.f32 q11, q5, d4[1] \n" - "vmla.f32 q12, q4, d5[0] \n" - "vmla.f32 q13, q5, d5[0] \n" - "vmla.f32 q14, q4, d5[1] \n" - "vmla.f32 q15, q5, d5[1] \n" - - "vmla.f32 q8, q6, d6[0] \n" - "vmla.f32 q9, q7, d6[0] \n" - "vmla.f32 q10, q6, d6[1] \n" - "vmla.f32 q11, q7, d6[1] \n" - "vmla.f32 q12, q6, d7[0] \n" - "vmla.f32 q13, q7, d7[0] \n" - "vmla.f32 q14, q6, d7[1] \n" - "vmla.f32 q15, q7, d7[1] \n" - - "vst1.f32 {d16-d17}, [%[packed_output_data]]! \n" - "vst1.f32 {d18-d19}, [%[packed_output_data]]! \n" - "vst1.f32 {d20-d21}, [%[packed_output_data]]! \n" - "vst1.f32 {d22-d23}, [%[packed_output_data]]! \n" - "vst1.f32 {d24-d25}, [%[packed_output_data]]! \n" - "vst1.f32 {d26-d27}, [%[packed_output_data]]! \n" - "vst1.f32 {d28-d29}, [%[packed_output_data]]! \n" - "vst1.f32 {d30-d31}, [%[packed_output_data]]! \n" + "mov r0, #0\n" + "vdup.f32 q8, r0 \n" + "vdup.f32 q9, r0 \n" + "vdup.f32 q10, r0 \n" + "vdup.f32 q11, r0 \n" + "vdup.f32 q12, r0 \n" + "vdup.f32 q13, r0 \n" + "vdup.f32 q14, r0 \n" + "vdup.f32 q15, r0 \n" + + // prelogue + "vld1.f32 {d0-d1}, [%[lhs_ptr]]! \n" + "vld1.f32 {d2-d3}, [%[lhs_ptr]]! \n" + "vld1.f32 {d4-d5}, [%[lhs_ptr]]! \n" + "vld1.f32 {d6-d7}, [%[lhs_ptr]]! \n" + + "vld1.f32 {d8-d9}, [%[rhs_ptr]]! \n" + "vld1.f32 {d10-d11}, [%[rhs_ptr]]! \n" + "vld1.f32 {d12-d13}, [%[rhs_ptr]]! \n" + "vld1.f32 {d14-d15}, [%[rhs_ptr]]! \n" + + "subs %[r_depth_block_count], %[r_depth_block_count], #1 \n" + "beq 1f\n" + + "0: \n" + + "vmla.f32 q8, q4, d0[0] \n" + "vmla.f32 q9, q5, d0[0] \n" + "vmla.f32 q10, q4, d0[1] \n" + "vmla.f32 q11, q5, d0[1] \n" + "vmla.f32 q12, q4, d1[0] \n" + "vmla.f32 q13, q5, d1[0] \n" + "vmla.f32 q14, q4, d1[1] \n" + "vmla.f32 q15, q5, d1[1] \n" + + "vld1.f32 {d0-d1}, [%[lhs_ptr]]! \n" + "vld1.f32 {d8-d9}, [%[rhs_ptr]]! \n" + "vld1.f32 {d10-d11}, [%[rhs_ptr]]! \n" + + "vmla.f32 q8, q6, d2[0] \n" + "vmla.f32 q9, q7, d2[0] \n" + "vmla.f32 q10, q6, d2[1] \n" + "vmla.f32 q11, q7, d2[1] \n" + "vmla.f32 q12, q6, d3[0] \n" + "vmla.f32 q13, q7, d3[0] \n" + "vmla.f32 q14, q6, d3[1] \n" + "vmla.f32 q15, q7, d3[1] \n" + + "vld1.f32 {d2-d3}, [%[lhs_ptr]]! \n" + "vld1.f32 {d12-d13}, [%[rhs_ptr]]! \n" + "vld1.f32 {d14-d15}, [%[rhs_ptr]]! \n" + + "vmla.f32 q8, q4, d4[0] \n" + "vmla.f32 q9, q5, d4[0] \n" + "vmla.f32 q10, q4, d4[1] \n" + "vmla.f32 q11, q5, d4[1] \n" + "vmla.f32 q12, q4, d5[0] \n" + "vmla.f32 q13, q5, d5[0] \n" + "vmla.f32 q14, q4, d5[1] \n" + "vmla.f32 q15, q5, d5[1] \n" + + "vld1.f32 {d4-d5}, [%[lhs_ptr]]! \n" + "vld1.f32 {d8-d9}, [%[rhs_ptr]]! \n" + "vld1.f32 {d10-d11}, [%[rhs_ptr]]! 
\n" + + "subs %[r_depth_block_count], %[r_depth_block_count], #1 \n" + + "vmla.f32 q8, q6, d6[0] \n" + "vmla.f32 q9, q7, d6[0] \n" + "vmla.f32 q10, q6, d6[1] \n" + "vmla.f32 q11, q7, d6[1] \n" + "vmla.f32 q12, q6, d7[0] \n" + "vmla.f32 q13, q7, d7[0] \n" + "vmla.f32 q14, q6, d7[1] \n" + "vmla.f32 q15, q7, d7[1] \n" + + "vld1.f32 {d6-d7}, [%[lhs_ptr]]! \n" + "vld1.f32 {d12-d13}, [%[rhs_ptr]]! \n" + "vld1.f32 {d14-d15}, [%[rhs_ptr]]! \n" + + "bne 0b \n" + + // prologue + "1:\n" + "vmla.f32 q8, q4, d0[0] \n" + "vmla.f32 q9, q5, d0[0] \n" + "vmla.f32 q10, q4, d0[1] \n" + "vmla.f32 q11, q5, d0[1] \n" + "vmla.f32 q12, q4, d1[0] \n" + "vmla.f32 q13, q5, d1[0] \n" + "vmla.f32 q14, q4, d1[1] \n" + "vmla.f32 q15, q5, d1[1] \n" + + "vld1.f32 {d8-d9}, [%[rhs_ptr]]! \n" + "vld1.f32 {d10-d11}, [%[rhs_ptr]]! \n" + + "vmla.f32 q8, q6, d2[0] \n" + "vmla.f32 q9, q7, d2[0] \n" + "vmla.f32 q10, q6, d2[1] \n" + "vmla.f32 q11, q7, d2[1] \n" + "vmla.f32 q12, q6, d3[0] \n" + "vmla.f32 q13, q7, d3[0] \n" + "vmla.f32 q14, q6, d3[1] \n" + "vmla.f32 q15, q7, d3[1] \n" + + "vld1.f32 {d12-d13}, [%[rhs_ptr]]! \n" + "vld1.f32 {d14-d15}, [%[rhs_ptr]]! \n" + + "vmla.f32 q8, q4, d4[0] \n" + "vmla.f32 q9, q5, d4[0] \n" + "vmla.f32 q10, q4, d4[1] \n" + "vmla.f32 q11, q5, d4[1] \n" + "vmla.f32 q12, q4, d5[0] \n" + "vmla.f32 q13, q5, d5[0] \n" + "vmla.f32 q14, q4, d5[1] \n" + "vmla.f32 q15, q5, d5[1] \n" + + "vmla.f32 q8, q6, d6[0] \n" + "vmla.f32 q9, q7, d6[0] \n" + "vmla.f32 q10, q6, d6[1] \n" + "vmla.f32 q11, q7, d6[1] \n" + "vmla.f32 q12, q6, d7[0] \n" + "vmla.f32 q13, q7, d7[0] \n" + "vmla.f32 q14, q6, d7[1] \n" + "vmla.f32 q15, q7, d7[1] \n" + + "vst1.f32 {d16-d17}, [%[packed_output_data]]! \n" + "vst1.f32 {d18-d19}, [%[packed_output_data]]! \n" + "vst1.f32 {d20-d21}, [%[packed_output_data]]! \n" + "vst1.f32 {d22-d23}, [%[packed_output_data]]! \n" + "vst1.f32 {d24-d25}, [%[packed_output_data]]! \n" + "vst1.f32 {d26-d27}, [%[packed_output_data]]! \n" + "vst1.f32 {d28-d29}, [%[packed_output_data]]! \n" + "vst1.f32 {d30-d31}, [%[packed_output_data]]! 
\n" : // outputs [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr), diff --git a/mace/ops/arm/fp32/gemm.h b/mace/ops/arm/fp32/gemm.h index ce226c1a341d76d7f873cb527408688c2e538a8c..00b4d80eef4bf27f98c54f1c77a51765cc7f530d 100644 --- a/mace/ops/arm/fp32/gemm.h +++ b/mace/ops/arm/fp32/gemm.h @@ -32,8 +32,7 @@ namespace fp32 { class Gemm { public: explicit Gemm(const bool should_cache_pack) - : tmp_scratch_buffer_(GetCPUAllocator()), - pack_cache_(GetCPUAllocator()), + : pack_cache_(GetCPUAllocator()), should_cache_pack_(should_cache_pack), cached_(0) {} Gemm() : Gemm(false) {} @@ -126,7 +125,6 @@ class Gemm { } } - ScratchBuffer tmp_scratch_buffer_; Buffer pack_cache_; bool should_cache_pack_; diff --git a/mace/ops/arm/fp32/gemm_test.cc b/mace/ops/arm/fp32/gemm_test.cc index 372b3eb6e2580c875285c260c8c43c8fc6f0bc51..805720331b193895301b40b408b4eac0b384104c 100644 --- a/mace/ops/arm/fp32/gemm_test.cc +++ b/mace/ops/arm/fp32/gemm_test.cc @@ -51,7 +51,11 @@ void TestGemmFloat32(const index_t batch, GenerateRandomRealTypeData(output.shape(), output_data); } ::mace::ops::arm::fp32::Gemm gemm; - gemm.Compute(nullptr, + utils::ThreadPool thread_pool(1, AFFINITY_NONE); + thread_pool.Init(); + CPUDevice cpu_device(1, AFFINITY_NONE, &thread_pool); + OpContext context(nullptr, &cpu_device); + gemm.Compute(&context, &lhs, &rhs, batch, diff --git a/mace/ops/arm/fp32/gemv.cc b/mace/ops/arm/fp32/gemv.cc index 7caa0b5b23d1a9b30d81ce94126bfc2a1a5b82d6..2f2866cf0da86dd70402d28810247821f229d85b 100644 --- a/mace/ops/arm/fp32/gemv.cc +++ b/mace/ops/arm/fp32/gemv.cc @@ -48,8 +48,8 @@ MaceStatus Gemv::Compute(const OpContext *context, Tensor *output) { MACE_UNUSED(context); - MACE_CHECK(output->size() >= batch * lhs_height, - "Output buffer is not large enough for computing gemv."); + MACE_CHECK(output->size() == batch * lhs_height, + "Need resize output tensor before call gemv."); Tensor::MappingGuard lhs_guard(lhs); Tensor::MappingGuard rhs_guard(rhs); @@ -70,24 +70,29 @@ MaceStatus Gemv::Compute(const OpContext *context, const index_t w_block_count = lhs_width / w_block_size; const index_t w_remain = lhs_width - w_block_size * w_block_count; -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < batch; ++b) { - for (index_t h_block_idx = 0; h_block_idx < h_block_count; ++h_block_idx) { - const index_t h_start = h_block_idx * h_block_size; - const float - *lhs_ptr = lhs_data - + static_cast(lhs_batched) * b * lhs_height * lhs_width - + lhs_width * h_start; - const float *rhs_ptr = - rhs_data + static_cast(rhs_batched) * b * lhs_width; - float - *ret_ptr = output_data + b * lhs_height + h_start; - - const index_t h_block_len = - std::min(h_block_size, lhs_height - h_start); + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t h_block_idx = start1; h_block_idx < end1; + h_block_idx += step1) { + const index_t h_start = h_block_idx * h_block_size; + const float + *lhs_ptr = lhs_data + + static_cast(lhs_batched) * b * lhs_height * lhs_width + + lhs_width * h_start; + const float *rhs_ptr = + rhs_data + static_cast(rhs_batched) * b * lhs_width; + float + *ret_ptr = output_data + b * lhs_height + h_start; + + const index_t h_block_len = + std::min(h_block_size, lhs_height - h_start); #ifdef MACE_GEMV_UNROLL - if (h_block_len == 4) { + if (h_block_len == 4) { float32x4_t 
vo0 = vdupq_n_f32(0); float32x4_t vo1 = vdupq_n_f32(0); float32x4_t vo2 = vdupq_n_f32(0); @@ -360,10 +365,11 @@ MaceStatus Gemv::Compute(const OpContext *context, } // h #ifdef MACE_GEMV_UNROLL - } // if + } // if #endif // MACE_GEMV_UNROLL - } // h_block_idx - } // b + } // h_block_idx + } // b + }, 0, batch, 1, 0, h_block_count, 1); return MaceStatus::MACE_SUCCESS; } diff --git a/mace/ops/arm/fp32/gemv_test.cc b/mace/ops/arm/fp32/gemv_test.cc index b6b69254a5827f399a56100c2cebd47e5812412d..bc97bc3ee8ed9c52f62518830cba2b8775973702 100644 --- a/mace/ops/arm/fp32/gemv_test.cc +++ b/mace/ops/arm/fp32/gemv_test.cc @@ -49,8 +49,12 @@ void TestGemvFloat32(const index_t batch, GenerateRandomRealTypeData(rhs.shape(), rhs_data); GenerateRandomRealTypeData(bias.shape(), bias_data); } + utils::ThreadPool thread_pool(1, AFFINITY_NONE); + thread_pool.Init(); + CPUDevice cpu_device(1, AFFINITY_NONE, &thread_pool); + OpContext context(nullptr, &cpu_device); ::mace::ops::arm::fp32::Gemv gemv; - gemv.Compute(nullptr, + gemv.Compute(&context, &lhs, &rhs, &bias, diff --git a/mace/ops/arm/q8/eltwise.cc b/mace/ops/arm/q8/eltwise.cc index f987da81373282f769f660e5f10e7795413b3be4..bdaa57a640ec6e6d66cd080830211b95c4ceb5b5 100644 --- a/mace/ops/arm/q8/eltwise.cc +++ b/mace/ops/arm/q8/eltwise.cc @@ -46,15 +46,9 @@ MaceStatus Eltwise::Compute(const OpContext *context, int32_t input0_shift; int32_t input1_shift; int32_t output_shift; - QuantizeMultiplier(adjusted_input0_scale, - &input0_multiplier, - &input0_shift); - QuantizeMultiplier(adjusted_input1_scale, - &input1_multiplier, - &input1_shift); - QuantizeMultiplier(adjusted_output_scale, - &output_multiplier, - &output_shift); + QuantizeMultiplier(adjusted_input0_scale, &input0_multiplier, &input0_shift); + QuantizeMultiplier(adjusted_input1_scale, &input1_multiplier, &input1_shift); + QuantizeMultiplier(adjusted_output_scale, &output_multiplier, &output_shift); Tensor::MappingGuard input0_guard(input0); Tensor::MappingGuard input1_guard(input1); @@ -64,89 +58,97 @@ MaceStatus Eltwise::Compute(const OpContext *context, auto input1_ptr = input1->data(); auto output_ptr = output->mutable_data(); -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i <= output->size() - 8; i += 8) { - const auto input0_val = vld1_u8(input0_ptr + i); - const auto input1_val = vld1_u8(input1_ptr + i); - const auto input0_val_s16 = - vreinterpretq_s16_u16(vmovl_u8(input0_val)); - const auto input1_val_s16 = - vreinterpretq_s16_u16(vmovl_u8(input1_val)); - const auto offset_input0 = - vaddq_s16(input0_val_s16, vdupq_n_s16(-input0->zero_point())); - const auto offset_input1 = - vaddq_s16(input1_val_s16, vdupq_n_s16(-input1->zero_point())); - auto input0_low_s32 = vmovl_s16(vget_low_s16(offset_input0)); - auto input0_high_s32 = vmovl_s16(vget_high_s16(offset_input0)); - auto input1_low_s32 = vmovl_s16(vget_low_s16(offset_input1)); - auto input1_high_s32 = vmovl_s16(vget_high_s16(offset_input1)); - const auto left_shift_dup = vdupq_n_s32(left_shift); - input0_low_s32 = vshlq_s32(input0_low_s32, left_shift_dup); - input0_high_s32 = vshlq_s32(input0_high_s32, left_shift_dup); - input1_low_s32 = vshlq_s32(input1_low_s32, left_shift_dup); - input1_high_s32 = vshlq_s32(input1_high_s32, left_shift_dup); - input0_low_s32 = vqrdmulhq_n_s32(input0_low_s32, input0_multiplier); - input0_high_s32 = vqrdmulhq_n_s32(input0_high_s32, input0_multiplier); - input1_low_s32 = vqrdmulhq_n_s32(input1_low_s32, input1_multiplier); - input1_high_s32 = vqrdmulhq_n_s32(input1_high_s32, 
input1_multiplier); - const auto input0_shift_dup = vdupq_n_s32(input0_shift); - const auto input1_shift_dup = vdupq_n_s32(input1_shift); - input0_low_s32 = vshlq_s32(input0_low_s32, input0_shift_dup); - input0_high_s32 = vshlq_s32(input0_high_s32, input0_shift_dup); - input1_low_s32 = vshlq_s32(input1_low_s32, input1_shift_dup); - input1_high_s32 = vshlq_s32(input1_high_s32, input1_shift_dup); - int32x4_t res_low, res_high; - if (type_ == SUM) { - res_low = vaddq_s32(input0_low_s32, input1_low_s32); - res_high = vaddq_s32(input0_high_s32, input1_high_s32); - } else { - res_low = vsubq_s32(input0_low_s32, input1_low_s32); - res_high = vsubq_s32(input0_high_s32, input1_high_s32); - } - res_low = vqrdmulhq_n_s32(res_low, output_multiplier); - res_high = vqrdmulhq_n_s32(res_high, output_multiplier); - res_low = gemmlowp::RoundingDivideByPOT(res_low, -output_shift); - res_high = gemmlowp::RoundingDivideByPOT(res_high, -output_shift); - const auto res_low_s16 = vmovn_s32(res_low); - const auto res_high_s16 = vmovn_s32(res_high); - const auto output_val = vaddq_s16(vcombine_s16(res_low_s16, - res_high_s16), - vdupq_n_s16(output->zero_point())); - vst1_u8(output_ptr + i, vqmovun_s16(output_val)); - } + utils::ThreadPool &thread_pool = + context->device()->cpu_runtime()->thread_pool(); + thread_pool.Compute1D( + [=](index_t start, index_t end, index_t step) { + for (index_t i = start; i < end; i += step) { + const auto input0_val = vld1_u8(input0_ptr + i); + const auto input1_val = vld1_u8(input1_ptr + i); + const auto input0_val_s16 = + vreinterpretq_s16_u16(vmovl_u8(input0_val)); + const auto input1_val_s16 = + vreinterpretq_s16_u16(vmovl_u8(input1_val)); + const auto offset_input0 = + vaddq_s16(input0_val_s16, vdupq_n_s16(-input0->zero_point())); + const auto offset_input1 = + vaddq_s16(input1_val_s16, vdupq_n_s16(-input1->zero_point())); + auto input0_low_s32 = vmovl_s16(vget_low_s16(offset_input0)); + auto input0_high_s32 = vmovl_s16(vget_high_s16(offset_input0)); + auto input1_low_s32 = vmovl_s16(vget_low_s16(offset_input1)); + auto input1_high_s32 = vmovl_s16(vget_high_s16(offset_input1)); + const auto left_shift_dup = vdupq_n_s32(left_shift); + input0_low_s32 = vshlq_s32(input0_low_s32, left_shift_dup); + input0_high_s32 = vshlq_s32(input0_high_s32, left_shift_dup); + input1_low_s32 = vshlq_s32(input1_low_s32, left_shift_dup); + input1_high_s32 = vshlq_s32(input1_high_s32, left_shift_dup); + input0_low_s32 = vqrdmulhq_n_s32(input0_low_s32, input0_multiplier); + input0_high_s32 = vqrdmulhq_n_s32(input0_high_s32, input0_multiplier); + input1_low_s32 = vqrdmulhq_n_s32(input1_low_s32, input1_multiplier); + input1_high_s32 = vqrdmulhq_n_s32(input1_high_s32, input1_multiplier); + const auto input0_shift_dup = vdupq_n_s32(input0_shift); + const auto input1_shift_dup = vdupq_n_s32(input1_shift); + input0_low_s32 = vshlq_s32(input0_low_s32, input0_shift_dup); + input0_high_s32 = vshlq_s32(input0_high_s32, input0_shift_dup); + input1_low_s32 = vshlq_s32(input1_low_s32, input1_shift_dup); + input1_high_s32 = vshlq_s32(input1_high_s32, input1_shift_dup); + int32x4_t res_low, res_high; + if (type_ == SUM) { + res_low = vaddq_s32(input0_low_s32, input1_low_s32); + res_high = vaddq_s32(input0_high_s32, input1_high_s32); + } else { + res_low = vsubq_s32(input0_low_s32, input1_low_s32); + res_high = vsubq_s32(input0_high_s32, input1_high_s32); + } + res_low = vqrdmulhq_n_s32(res_low, output_multiplier); + res_high = vqrdmulhq_n_s32(res_high, output_multiplier); + res_low = 
gemmlowp::RoundingDivideByPOT(res_low, -output_shift); + res_high = gemmlowp::RoundingDivideByPOT(res_high, -output_shift); + const auto res_low_s16 = vmovn_s32(res_low); + const auto res_high_s16 = vmovn_s32(res_high); + const auto output_val = + vaddq_s16(vcombine_s16(res_low_s16, res_high_s16), + vdupq_n_s16(output->zero_point())); + vst1_u8(output_ptr + i, vqmovun_s16(output_val)); + } + }, + 0, output->size() - 7, 8); index_t handled_output_size = output->size() - output->size() % 8; -#pragma omp parallel for schedule(runtime) - for (index_t i = handled_output_size; i < output->size(); ++i) { - const int32_t offset_input0 = input0_ptr[i] - input0->zero_point(); - const int32_t offset_input1 = input1_ptr[i] - input1->zero_point(); - const int32_t shifted_input0 = offset_input0 * (1 << left_shift); - const int32_t shifted_input1 = offset_input1 * (1 << left_shift); - const int32_t multiplied_input0 = - gemmlowp::RoundingDivideByPOT( - gemmlowp::SaturatingRoundingDoublingHighMul(shifted_input0, - input0_multiplier), - -input0_shift); - const int32_t multiplied_input1 = - gemmlowp::RoundingDivideByPOT( - gemmlowp::SaturatingRoundingDoublingHighMul(shifted_input1, - input1_multiplier), - -input1_shift); - - int32_t res; - if (type_ == SUM) { - res = multiplied_input0 + multiplied_input1; - } else { - res = multiplied_input0 - multiplied_input1; - } - - const int32_t output_val = - gemmlowp::RoundingDivideByPOT( - gemmlowp::SaturatingRoundingDoublingHighMul(res, - output_multiplier), - -output_shift) + output->zero_point(); - output_ptr[i] = Saturate(output_val); - } + + thread_pool.Compute1D( + [=](index_t start, index_t end, index_t step) { + for (index_t i = start; i < end; i += step) { + const int32_t offset_input0 = input0_ptr[i] - input0->zero_point(); + const int32_t offset_input1 = input1_ptr[i] - input1->zero_point(); + const int32_t shifted_input0 = offset_input0 * (1 << left_shift); + const int32_t shifted_input1 = offset_input1 * (1 << left_shift); + const int32_t multiplied_input0 = gemmlowp::RoundingDivideByPOT( + gemmlowp::SaturatingRoundingDoublingHighMul(shifted_input0, + input0_multiplier), + -input0_shift); + const int32_t multiplied_input1 = gemmlowp::RoundingDivideByPOT( + gemmlowp::SaturatingRoundingDoublingHighMul(shifted_input1, + input1_multiplier), + -input1_shift); + + int32_t res; + if (type_ == SUM) { + res = multiplied_input0 + multiplied_input1; + } else { + res = multiplied_input0 - multiplied_input1; + } + + const int32_t output_val = + gemmlowp::RoundingDivideByPOT( + gemmlowp::SaturatingRoundingDoublingHighMul( + res, output_multiplier), + -output_shift) + + output->zero_point(); + output_ptr[i] = Saturate(output_val); + } + }, + handled_output_size, output->size(), 1); return MaceStatus::MACE_SUCCESS; } diff --git a/mace/ops/arm/q8/gemv.cc b/mace/ops/arm/q8/gemv.cc index ce102e7e3171ff3344b4535576c9187866305fcd..388c68147ff305cf603c95a62293024b7b1db03d 100644 --- a/mace/ops/arm/q8/gemv.cc +++ b/mace/ops/arm/q8/gemv.cc @@ -19,7 +19,7 @@ #include #include "mace/utils/math.h" -#include "mace/utils/quantize.h" +#include "mace/core/quantize.h" #if !defined(__aarch64__) @@ -82,91 +82,94 @@ MaceStatus Gemv::Compute(const OpContext *context, sum_rhs += static_cast(rhs_base[i]); } -#pragma omp parallel for schedule(runtime) - for (index_t h = 0; h < lhs_height; ++h) { - const uint8_t *lhs_ptr = lhs_data - + static_cast(lhs_batched) * b * lhs_height * lhs_width - + h * lhs_width; - const uint8_t *rhs_ptr = rhs_base; - OUTPUT_TYPE *output_ptr = output_data + b * 
lhs_height + h; - - uint32_t dot = 0; - uint32_t sum_lhs = 0; - uint32x4_t vo0_high_u32 = vdupq_n_u32(0); - uint32x4_t vo0_low_u32 = vdupq_n_u32(0); - uint32x4_t vo1_high_u32 = vdupq_n_u32(0); - uint32x4_t vo1_low_u32 = vdupq_n_u32(0); - uint32x4_t sum_lhs_low_u32 = vdupq_n_u32(0); - uint32x4_t sum_lhs_high_u32 = vdupq_n_u32(0); - - for (index_t w_block_idx = 0; w_block_idx < w_block_count; - ++w_block_idx) { - uint8x8_t vl0_u8 = vld1_u8(lhs_ptr); - uint8x8_t vl1_u8 = vld1_u8(lhs_ptr + 8); - - uint8x8_t vr0_u8 = vld1_u8(rhs_ptr); - uint8x8_t vr1_u8 = vld1_u8(rhs_ptr + 8); - - uint16x8_t vl0_u16 = vmovl_u8(vl0_u8); - uint16x8_t vl1_u16 = vmovl_u8(vl1_u8); - - uint16x8_t vr0_u16 = vmovl_u8(vr0_u8); - uint16x8_t vr1_u16 = vmovl_u8(vr1_u8); - - vo0_high_u32 = vmlal_u16(vo0_high_u32, - vget_high_u16(vl0_u16), - vget_high_u16(vr0_u16)); - vo0_low_u32 = vmlal_u16(vo0_low_u32, - vget_low_u16(vl0_u16), - vget_low_u16(vr0_u16)); - vo1_high_u32 = vmlal_u16(vo1_high_u32, - vget_high_u16(vl1_u16), - vget_high_u16(vr1_u16)); - vo1_low_u32 = vmlal_u16(vo1_low_u32, - vget_low_u16(vl1_u16), - vget_low_u16(vr1_u16)); - - // It can be precuculated if lhs is const, but for this case - // computation is not bottleneck - sum_lhs_high_u32 += vaddl_u16(vget_high_u16(vl0_u16), - vget_high_u16(vl1_u16)); - sum_lhs_low_u32 += vaddl_u16(vget_low_u16(vl0_u16), - vget_low_u16(vl1_u16)); - - lhs_ptr += 16; - rhs_ptr += 16; - } - - vo0_low_u32 = vaddq_u32(vo0_high_u32, vo0_low_u32); - vo1_low_u32 = vaddq_u32(vo1_high_u32, vo1_low_u32); - vo0_low_u32 = vaddq_u32(vo0_low_u32, vo1_low_u32); - dot += vaddvq_u32(vo0_low_u32); - - sum_lhs_low_u32 = vaddq_u32(sum_lhs_high_u32, sum_lhs_low_u32); - sum_lhs = vaddvq_u32(sum_lhs_low_u32); - - for (index_t w = 0; w < w_block_remain; ++w) { - dot += (*lhs_ptr) * (*rhs_ptr); - sum_lhs += (*lhs_ptr); - ++lhs_ptr; - ++rhs_ptr; - } - - const auto zero_point_dot = - static_cast(lhs_zero_point * rhs_zero_point * lhs_width); - int32_t ret = dot - sum_lhs * rhs_zero_point - sum_rhs * lhs_zero_point - + zero_point_dot; - if (bias) { - ret += bias->data()[h]; - } - - if (is_output_type_uint8_) { - *output_ptr = - Saturate(std::roundf(ret * output_multiplier_float)); - } else { - *output_ptr = ret; - } - } // h + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { + for (index_t h = start; h < end; h += step) { + const uint8_t *lhs_ptr = lhs_data + + static_cast(lhs_batched) * b * lhs_height * lhs_width + + h * lhs_width; + const uint8_t *rhs_ptr = rhs_base; + OUTPUT_TYPE *output_ptr = output_data + b * lhs_height + h; + + uint32_t dot = 0; + uint32_t sum_lhs = 0; + uint32x4_t vo0_high_u32 = vdupq_n_u32(0); + uint32x4_t vo0_low_u32 = vdupq_n_u32(0); + uint32x4_t vo1_high_u32 = vdupq_n_u32(0); + uint32x4_t vo1_low_u32 = vdupq_n_u32(0); + uint32x4_t sum_lhs_low_u32 = vdupq_n_u32(0); + uint32x4_t sum_lhs_high_u32 = vdupq_n_u32(0); + + for (index_t w_block_idx = 0; w_block_idx < w_block_count; + ++w_block_idx) { + uint8x8_t vl0_u8 = vld1_u8(lhs_ptr); + uint8x8_t vl1_u8 = vld1_u8(lhs_ptr + 8); + + uint8x8_t vr0_u8 = vld1_u8(rhs_ptr); + uint8x8_t vr1_u8 = vld1_u8(rhs_ptr + 8); + + uint16x8_t vl0_u16 = vmovl_u8(vl0_u8); + uint16x8_t vl1_u16 = vmovl_u8(vl1_u8); + + uint16x8_t vr0_u16 = vmovl_u8(vr0_u8); + uint16x8_t vr1_u16 = vmovl_u8(vr1_u8); + + vo0_high_u32 = vmlal_u16(vo0_high_u32, + vget_high_u16(vl0_u16), + vget_high_u16(vr0_u16)); + vo0_low_u32 = vmlal_u16(vo0_low_u32, + vget_low_u16(vl0_u16), 
+ vget_low_u16(vr0_u16)); + vo1_high_u32 = vmlal_u16(vo1_high_u32, + vget_high_u16(vl1_u16), + vget_high_u16(vr1_u16)); + vo1_low_u32 = vmlal_u16(vo1_low_u32, + vget_low_u16(vl1_u16), + vget_low_u16(vr1_u16)); + + // It can be precalculated if lhs is const, but for this case + // computation is not bottleneck + sum_lhs_high_u32 += vaddl_u16(vget_high_u16(vl0_u16), + vget_high_u16(vl1_u16)); + sum_lhs_low_u32 += vaddl_u16(vget_low_u16(vl0_u16), + vget_low_u16(vl1_u16)); + + lhs_ptr += 16; + rhs_ptr += 16; + } + + vo0_low_u32 = vaddq_u32(vo0_high_u32, vo0_low_u32); + vo1_low_u32 = vaddq_u32(vo1_high_u32, vo1_low_u32); + vo0_low_u32 = vaddq_u32(vo0_low_u32, vo1_low_u32); + dot += vaddvq_u32(vo0_low_u32); + + sum_lhs_low_u32 = vaddq_u32(sum_lhs_high_u32, sum_lhs_low_u32); + sum_lhs = vaddvq_u32(sum_lhs_low_u32); + + for (index_t w = 0; w < w_block_remain; ++w) { + dot += (*lhs_ptr) * (*rhs_ptr); + sum_lhs += (*lhs_ptr); + ++lhs_ptr; + ++rhs_ptr; + } + + const auto zero_point_dot = + static_cast(lhs_zero_point * rhs_zero_point * lhs_width); + int32_t ret = dot - sum_lhs * rhs_zero_point - sum_rhs * lhs_zero_point + + zero_point_dot; + if (bias) { + ret += bias->data()[h]; + } + + if (is_output_type_uint8_) { + *output_ptr = + Saturate(std::roundf(ret * output_multiplier_float)); + } else { + *output_ptr = ret; + } + } // h + }, 0, lhs_height, 1); } // b diff --git a/mace/ops/arm/q8/gemv_test.cc b/mace/ops/arm/q8/gemv_test.cc index ced75f64716e4feb2f24603eda4883078c8ade94..6216cabaed02bbfc84ebc4b10adc0a012cdece3e 100644 --- a/mace/ops/arm/q8/gemv_test.cc +++ b/mace/ops/arm/q8/gemv_test.cc @@ -54,8 +54,12 @@ void TestGemvInt32(const index_t batch, GenerateRandomIntTypeData(bias.shape(), bias_data); } + utils::ThreadPool thread_pool(1, AFFINITY_NONE); + thread_pool.Init(); + CPUDevice cpu_device(1, AFFINITY_NONE, &thread_pool); + OpContext context(nullptr, &cpu_device); mace::ops::arm::q8::Gemv gemv; - gemv.Compute(nullptr, + gemv.Compute(&context, &lhs, &rhs, &bias, @@ -122,8 +126,12 @@ void TestGemvUint8(const index_t batch, GenerateRandomIntTypeData(bias.shape(), bias_data); } + utils::ThreadPool thread_pool(1, AFFINITY_NONE); + thread_pool.Init(); + CPUDevice cpu_device(1, AFFINITY_NONE, &thread_pool); + OpContext context(nullptr, &cpu_device); mace::ops::arm::q8::Gemv gemv; - gemv.Compute(nullptr, + gemv.Compute(&context, &lhs, &rhs, &bias, diff --git a/mace/ops/batch_norm.cc b/mace/ops/batch_norm.cc index 469efe2e0c5eaac299d2622931a5e36154973d8e..c6559032973cdc580aa34b6fe53aaae5f8d585b3 100644 --- a/mace/ops/batch_norm.cc +++ b/mace/ops/batch_norm.cc @@ -18,6 +18,13 @@ #include "mace/core/operator.h" #include "mace/ops/activation.h" + +#if defined(MACE_ENABLE_NEON) +#include "mace/ops/arm/fp32/activation.h" +#else +#include "mace/ops/ref/activation.h" +#endif + #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/buffer_transformer.h" #include "mace/ops/opencl/image/batch_norm.h" @@ -27,21 +34,22 @@ namespace mace { namespace ops { -template +template class BatchNormOp; -template <> +template<> class BatchNormOp : public Operation { public: explicit BatchNormOp(OpConstructContext *context) : Operation(context), epsilon_(Operation::GetOptionalArg("epsilon", static_cast(1e-4))), - activation_(ops::StringToActivationType( - Operation::GetOptionalArg("activation", "NOOP"))), - relux_max_limit_(Operation::GetOptionalArg("max_limit", 0.0f)), - leakyrelu_coefficient_(Operation::GetOptionalArg( - "leakyrelu_coefficient", 0.0f)) {} + activation_delegator_( + ops::StringToActivationType( + 
Operation::GetOptionalArg("activation", "NOOP")), + Operation::GetOptionalArg("max_limit", 0.0f), + Operation::GetOptionalArg( + "leakyrelu_coefficient", 0.0f)) {} MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); @@ -73,74 +81,85 @@ class BatchNormOp : public Operation { const index_t height = input->dim(2); const index_t width = input->dim(3); - Tensor::MappingGuard input_mapper(input); - Tensor::MappingGuard scale_mapper(scale); - Tensor::MappingGuard offset_mapper(offset); - Tensor::MappingGuard output_mapper(output); - - const float *input_ptr = input->data(); - const float *scale_ptr = scale->data(); - const float *offset_ptr = offset->data(); - float *output_ptr = output->mutable_data(); - - std::vector new_scale; - std::vector new_offset; - if (not_folded) { - const Tensor *mean = this->Input(MEAN); - const Tensor *var = this->Input(VAR); - MACE_CHECK(mean->dim_size() == 1, "mean must be 1-dimensional. ", - mean->dim_size()); - MACE_CHECK(var->dim_size() == 1, "var must be 1-dimensional. ", - var->dim_size()); - new_scale.resize(channels); - new_offset.resize(channels); - Tensor::MappingGuard mean_mapper(mean); - Tensor::MappingGuard var_mapper(var); - const float *mean_ptr = mean->data(); - const float *var_ptr = var->data(); -#pragma omp parallel for - for (index_t c = 0; c < channels; ++c) { - new_scale[c] = scale_ptr[c] / std::sqrt(var_ptr[c] + epsilon_); - new_offset[c] = offset_ptr[c] - mean_ptr[c] * new_scale[c]; + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + { + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard scale_mapper(scale); + Tensor::MappingGuard offset_mapper(offset); + Tensor::MappingGuard output_mapper(output); + + const float *input_ptr = input->data(); + const float *scale_ptr = scale->data(); + const float *offset_ptr = offset->data(); + float *output_ptr = output->mutable_data(); + + std::vector new_scale; + std::vector new_offset; + if (not_folded) { + const Tensor *mean = this->Input(MEAN); + const Tensor *var = this->Input(VAR); + MACE_CHECK(mean->dim_size() == 1, "mean must be 1-dimensional. ", + mean->dim_size()); + MACE_CHECK(var->dim_size() == 1, "var must be 1-dimensional. ", + var->dim_size()); + new_scale.resize(channels); + new_offset.resize(channels); + Tensor::MappingGuard mean_mapper(mean); + Tensor::MappingGuard var_mapper(var); + const float *mean_ptr = mean->data(); + const float *var_ptr = var->data(); + + thread_pool.Compute1D([=, &new_scale, &new_offset](index_t start, + index_t end, + index_t step) { + for (index_t c = start; c < end; c += step) { + new_scale[c] = scale_ptr[c] / std::sqrt(var_ptr[c] + epsilon_); + new_offset[c] = offset_ptr[c] - mean_ptr[c] * new_scale[c]; + } + }, 0, channels, 1); } - } - const float *scale_data = not_folded ? new_scale.data() : scale_ptr; - const float - *offset_data = not_folded ? new_offset.data() : offset_ptr; - - index_t channel_size = height * width; - index_t batch_size = channels * channel_size; - - // NEON is slower, so stick to the trivial implementaion -#pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch; ++b) { - for (index_t c = 0; c < channels; ++c) { - index_t offset = b * batch_size + c * channel_size; - for (index_t hw = 0; hw < height * width; ++hw) { - output_ptr[offset + hw] = - scale_data[c] * input_ptr[offset + hw] + offset_data[c]; + const float *scale_data = not_folded ? new_scale.data() : scale_ptr; + const float + *offset_data = not_folded ? 
new_offset.data() : offset_ptr; + + index_t channel_size = height * width; + index_t batch_size = channels * channel_size; + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { + index_t offset = b * batch_size + c * channel_size; + for (index_t hw = 0; hw < height * width; ++hw) { + output_ptr[offset + hw] = + scale_data[c] * input_ptr[offset + hw] + offset_data[c]; + } + } } - } + }, 0, batch, 1, 0, channels, 1); } - DoActivation(output_ptr, output_ptr, output->size(), activation_, - relux_max_limit_, leakyrelu_coefficient_); + + activation_delegator_.Compute(context, output, output); return MaceStatus::MACE_SUCCESS; } private: float epsilon_; - const ActivationType activation_; - const float relux_max_limit_; - const float leakyrelu_coefficient_; +#ifdef MACE_ENABLE_NEON + arm::fp32::Activation activation_delegator_; +#else + ref::Activation activation_delegator_; +#endif // MACE_ENABLE_NEON protected: MACE_OP_INPUT_TAGS(INPUT, SCALE, OFFSET, MEAN, VAR); MACE_OP_OUTPUT_TAGS(OUTPUT); }; - #ifdef MACE_ENABLE_OPENCL template class BatchNormOp : public Operation { @@ -213,7 +232,6 @@ class BatchNormOp : public Operation { }; #endif // MACE_ENABLE_OPENCL - void RegisterBatchNorm(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "BatchNorm", BatchNormOp, DeviceType::CPU, float); diff --git a/mace/ops/batch_to_space.cc b/mace/ops/batch_to_space.cc index cfd350d458429ea86a68e9176c41108e2469f392..c44501f12e73a92c942d987ac1e51a0fbd1648c9 100644 --- a/mace/ops/batch_to_space.cc +++ b/mace/ops/batch_to_space.cc @@ -125,7 +125,6 @@ class BatchToSpaceNDOp : public BatchToSpaceOpBase { std::max(static_cast(1), 8 * 1024 / block_shape_w / out_width); // make channel outter loop so we can make best use of cache -#pragma omp parallel for collapse(3) schedule(runtime) for (index_t c = 0; c < channels; ++c) { for (index_t block_h = 0; block_h < in_height; block_h += block_h_size) { @@ -214,7 +213,6 @@ class BatchToSpaceNDOp : public BatchToSpaceOpBase { index_t out_width = space_tensor->dim(2); index_t channels = space_tensor->dim(3); -#pragma omp parallel for schedule(runtime) for (index_t in_b = 0; in_b < in_batches; ++in_b) { const index_t b = in_b % out_batches; const index_t tile_index = in_b / out_batches; diff --git a/mace/ops/bias_add.cc b/mace/ops/bias_add.cc index 6606c2c257b2ead3dd756a8477e39f383a25b37c..9351de79518ee71671f7595f39f2c410a7e7b265 100644 --- a/mace/ops/bias_add.cc +++ b/mace/ops/bias_add.cc @@ -18,6 +18,13 @@ #include "mace/core/operator.h" #include "mace/ops/activation.h" + +#ifdef MACE_ENABLE_NEON +#include "mace/ops/arm/fp32/bias_add.h" +#else +#include "mace/ops/ref/bias_add.h" +#endif // MACE_ENABLE_NEON + #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/buffer_transformer.h" #include "mace/ops/opencl/image/bias_add.h" @@ -47,36 +54,26 @@ class BiasAddOp : public Operation { bias->dim_size()); Tensor *output = this->Output(0); - MACE_RETURN_IF_ERROR(output->ResizeLike(input)); - Tensor::MappingGuard input_mapper(input); - Tensor::MappingGuard bias_mapper(bias); - Tensor::MappingGuard output_mapper(output); + if (input->dim_size() == 4 && has_data_format_) { + bias_add_delegator_.Compute(context, input, bias, output); + } else { + // TODO(liyin): remove it and tranform bias to add (eltwise) + MACE_RETURN_IF_ERROR(output->ResizeLike(input)); - const float *input_ptr = input->data(); - const float 
*bias_ptr = bias->data(); - float *output_ptr = output->mutable_data(); + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard bias_mapper(bias); + Tensor::MappingGuard output_mapper(output); - if (input->dim_size() == 4 && has_data_format_) { - const index_t batch = input->dim(0); - const index_t channels = input->dim(1); - const index_t height_width = input->dim(2) * input->dim(3); + const float *input_ptr = input->data(); + const float *bias_ptr = bias->data(); + float *output_ptr = output->mutable_data(); -#pragma omp parallel for collapse(2) - for (index_t n = 0; n < batch; ++n) { - for (index_t c = 0; c < channels; ++c) { - for (index_t hw = 0; hw < height_width; ++hw) { - index_t pos = (n * channels + c) * height_width + hw; - output_ptr[pos] = input_ptr[pos] + bias_ptr[c]; - } - } - } - } else { const std::vector &shape = input->shape(); const index_t fused_batch = std::accumulate( shape.begin(), shape.end() - 1, 1, std::multiplies()); const index_t channels = *shape.rbegin(); -#pragma omp parallel for + for (index_t n = 0; n < fused_batch; ++n) { index_t pos = n * channels; for (index_t c = 0; c < channels; ++c) { @@ -91,6 +88,11 @@ class BiasAddOp : public Operation { private: int has_data_format_; +#ifdef MACE_ENABLE_NEON + arm::fp32::BiasAdd bias_add_delegator_; +#else + ref::BiasAdd bias_add_delegator_; +#endif // MACE_ENABLE_NEON }; #ifdef MACE_ENABLE_OPENCL diff --git a/mace/ops/channel_shuffle.cc b/mace/ops/channel_shuffle.cc index 70e1811a07292af8eb0982caf46decb393f28325..966b5d57347b9405d3d43d9c113b00de3d38ce3e 100644 --- a/mace/ops/channel_shuffle.cc +++ b/mace/ops/channel_shuffle.cc @@ -56,7 +56,6 @@ class ChannelShuffleOp : public Operation { index_t batch_size = channels * image_size; index_t channels_per_group = channels / groups_; -#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < batch; ++b) { for (index_t c = 0; c < channels; ++c) { index_t g = c % groups_; diff --git a/mace/ops/common/conv_pool_2d_util.cc b/mace/ops/common/conv_pool_2d_util.cc index ade33c59002d3924123eede8687269de3abb2119..2ca95a7d75986c03c81d80f9ce0365d53df7005b 100644 --- a/mace/ops/common/conv_pool_2d_util.cc +++ b/mace/ops/common/conv_pool_2d_util.cc @@ -76,16 +76,14 @@ void CalcPaddingAndOutputSize(const index_t *input_shape, output_height = (input_height - k_extent_height) / strides[0] + 1; output_width = (input_width - k_extent_width) / strides[1] + 1; break; - case SAME: - output_height = (input_height - 1) / strides[0] + 1; + case SAME:output_height = (input_height - 1) / strides[0] + 1; output_width = (input_width - 1) / strides[1] + 1; break; case FULL: output_height = (input_height + k_extent_height - 2) / strides[0] + 1; output_width = (input_width + k_extent_width - 2) / strides[1] + 1; break; - default: - MACE_CHECK(false, "Unsupported padding type: ", padding); + default:MACE_CHECK(false, "Unsupported padding type: ", padding); } // Note: TensorFlow may padded one more on the right/bottom side @@ -210,20 +208,6 @@ void CalcOutputSize(const index_t *input_shape, } } -void CalcNCHWInputShape(const index_t *output_shape, - const index_t *filter_shape, - const int *strides, - const int *dilations, - index_t *input_shape) { - MACE_CHECK_NOTNULL(input_shape); - input_shape[0] = output_shape[0]; - input_shape[1] = filter_shape[1]; - input_shape[2] = (output_shape[2] - 1) * strides[0] + - (filter_shape[2] - 1) * dilations[0] + 1; - input_shape[3] = (output_shape[3] - 1) * strides[1] + - (filter_shape[3] - 1) * dilations[1] + 1; -} - void 
CalcOutputSize(const index_t *input_shape, // NHWC const index_t *filter_shape, // OIHW const int *padding_size, @@ -236,231 +220,202 @@ void CalcOutputSize(const index_t *input_shape, // NHWC } void CalcNCHWOutputSize(const index_t *input_shape, // NCHW - const index_t *filter_shape, // OIHW - const int *padding_size, - const int *dilations, - const int *strides, - const RoundType round_type, - index_t *output_shape) { + const index_t *filter_shape, // OIHW + const int *padding_size, + const int *dilations, + const int *strides, + const RoundType round_type, + index_t *output_shape) { CalcOutputSize(input_shape, NCHW, filter_shape, OIHW, padding_size, dilations, strides, round_type, output_shape); } -void CalPaddingSize(const index_t *input_shape, // NCHW - const index_t *filter_shape, // OIHW - const int *strides, - const int *dilations, - Padding padding, - int *padding_size) { - MACE_CHECK(dilations[0] > 0 && dilations[1] > 0, - "Invalid dilations, must >= 1"); - MACE_CHECK((dilations[0] == 1 || strides[0] == 1) && - (dilations[1] == 1 || strides[1] == 1), - "If dilations > 1, strides should be 1"); - MACE_CHECK_NOTNULL(padding_size); - - index_t output_height = 0, output_width = 0; - index_t k_extent_height = (filter_shape[2] - 1) * dilations[0] + 1; - index_t k_extent_width = (filter_shape[3] - 1) * dilations[1] + 1; - - switch (padding) { +void CalcDeconvShape_TF(const std::vector &input_shape, + const std::vector &filter_shape, + const std::vector &output_shape, + const std::vector &strides, + Padding padding_type, + const int group, + std::vector *in_pad_size, + std::vector *out_pad_size, + std::vector *padded_out_shape, + DataFormat data_format) { + const index_t + in_height = data_format == NCHW ? input_shape[2] : input_shape[1]; + const index_t + in_width = data_format == NCHW ? input_shape[3] : input_shape[2]; + + const index_t + out_height = data_format == NCHW ? output_shape[2] : output_shape[1]; + const index_t + out_width = data_format == NCHW ? output_shape[3] : output_shape[2]; + + const index_t extended_in_height = (in_height - 1) * strides[0] + 1; + const index_t extended_in_width = (in_width - 1) * strides[1] + 1; + + const index_t kernel_h = filter_shape[2]; + const index_t kernel_w = filter_shape[3]; + + index_t expected_input_height = 0, expected_input_width = 0; + + switch (padding_type) { case VALID: - output_height = (input_shape[2] - k_extent_height) / strides[0] + 1; - output_width = (input_shape[3] - k_extent_width) / strides[1] + 1; + expected_input_height = + (out_height - kernel_h + strides[0]) / strides[0]; + expected_input_width = + (out_width - kernel_w + strides[1]) / strides[1]; break; case SAME: - output_height = (input_shape[2] - 1) / strides[0] + 1; - output_width = (input_shape[3] - 1) / strides[1] + 1; - break; - case FULL: - output_height = (input_shape[2] + k_extent_height - 2) / strides[0] + 1; - output_width = (input_shape[3] + k_extent_width - 2) / strides[1] + 1; + expected_input_height = + (out_height + strides[0] - 1) / strides[0]; + expected_input_width = + (out_width + strides[1] - 1) / strides[1]; break; - default: - MACE_CHECK(false, "Unsupported padding type: ", padding); + default:MACE_CHECK(false, "Unsupported padding type: ", padding_type); } - // Note: TensorFlow may padded one more on the right/bottom side - // TODO(liuqi): may be it's better to also truncate the left/top to - // utilize the more centered features. We need to benchmark - // based on the model accuracy. 
- padding_size[0] = std::max( - 0, (output_height - 1) * strides[0] + k_extent_height - input_shape[2]); - padding_size[1] = std::max( - 0, (output_width - 1) * strides[1] + k_extent_width - input_shape[3]); -} - - -MaceStatus ConstructNCHWInputWithPadding(const Tensor *input_tensor, - const int *paddings, - Tensor *output_tensor, - bool padding_same_value) { - Tensor::MappingGuard input_mapper(input_tensor); - const float *input = input_tensor->data(); - const index_t *input_shape = input_tensor->shape().data(); - - index_t batch = input_shape[0]; - index_t channels = input_shape[1]; - index_t height = input_shape[2]; - index_t width = input_shape[3]; - - std::vector output_shape( - {batch, channels, paddings[0] + height, paddings[1] + width}); - - const index_t output_width = output_shape[3]; - const int padded_top = paddings[0] / 2; - const int padded_left = paddings[1] / 2; - - MACE_RETURN_IF_ERROR(output_tensor->Resize(output_shape)); - - Tensor::MappingGuard padded_output_mapper(output_tensor); - float *output_data = output_tensor->mutable_data(); - memset(output_data, 0, output_tensor->size() * sizeof(float)); - - // Skip the padded top rows - if (padding_same_value) { -#define MACE_COPY_INPUT \ - std::fill(output_data, output_data + padded_left, input[0]); \ - output_data += padded_left; \ - memcpy(output_data, input, width * sizeof(float)); \ - output_data += width; \ - std::fill(output_data, output_data + padded_right, input[width - 1]); \ - output_data += padded_right; - - const int padded_bottom = paddings[0] - padded_top; - const int padded_right = paddings[1] - padded_left; - - for (int i = 0; i < batch; ++i) { - for (int j = 0; j < channels; ++j) { - for (int k = 0; k < padded_top; ++k) { - MACE_COPY_INPUT; - } - for (int k = 0; k < height; ++k) { - MACE_COPY_INPUT; - input += width; - } - input -= width; - for (int k = 0; k < padded_bottom; ++k) { - MACE_COPY_INPUT; - } - input += width; - } - } -#undef MACE_COPY_INPUT - } else { - output_data += padded_top * output_width; - for (int i = 0; i < batch; ++i) { - for (int j = 0; j < channels; ++j) { - for (int k = 0; k < height; ++k) { - memcpy(output_data + padded_left, input, width * sizeof(float)); - input += width; - output_data += output_width; - } - // Skip the padded bottom in this channel and top in the next channel - output_data += paddings[0] * output_width; - } - } + MACE_CHECK(expected_input_height == in_height, + expected_input_height, "!=", in_height); + MACE_CHECK(expected_input_width == in_width, + expected_input_width, "!=", in_width); + + const index_t padded_out_height = + (in_height - 1) * strides[0] + kernel_h; + const index_t padded_out_width = + (in_width - 1) * strides[1] + kernel_w; + + if (in_pad_size != nullptr) { + const int p_h = + static_cast(out_height + kernel_h - 1 - extended_in_height); + const int p_w = + static_cast(out_width + kernel_w - 1 - extended_in_width); + in_pad_size->resize(2); + (*in_pad_size)[0] = std::max(0, p_h); + (*in_pad_size)[1] = std::max(0, p_w); } - return MaceStatus::MACE_SUCCESS; -} - -MaceStatus ConstructNCHWInputWithSpecificPadding(const Tensor *input_tensor, - const int pad_top, - const int pad_bottom, - const int pad_left, - const int pad_right, - Tensor *output_tensor) { - const float *input = input_tensor->data(); - const index_t *input_shape = input_tensor->shape().data(); - - index_t batch = input_shape[0]; - index_t channels = input_shape[1]; - index_t height = input_shape[2]; - index_t width = input_shape[3]; - - const int pad_height = pad_top + pad_bottom; - 
const int pad_width = pad_left + pad_right; - std::vector output_shape( - {batch, channels, height + pad_height, width + pad_width}); - MACE_RETURN_IF_ERROR(output_tensor->Resize(output_shape)); - output_tensor->Clear(); - Tensor::MappingGuard padded_output_mapper(output_tensor); - float *output_data = output_tensor->mutable_data(); - - const index_t output_height = output_shape[2]; - const index_t output_width = output_shape[3]; - const index_t in_image_size = height * width; - const index_t out_image_size = output_height * output_width; - const index_t in_batch_size = channels * in_image_size; - const index_t out_batch_size = channels * out_image_size; - -#pragma omp parallel for collapse(2) schedule(runtime) - for (int i = 0; i < batch; ++i) { - for (int j = 0; j < channels; ++j) { - for (int k = 0; k < height; ++k) { - memcpy(output_data + i * out_batch_size + j * out_image_size - + (pad_top + k) * output_width + pad_left, - input + i * in_batch_size + j * in_image_size + k * width, - width * sizeof(float)); - } - // Skip the padded bottom in this channel and top in the next channel - } + if (out_pad_size != nullptr) { + const int o_p_h = static_cast(padded_out_height - out_height); + const int o_p_w = static_cast(padded_out_width - out_width); + out_pad_size->resize(2); + (*out_pad_size)[0] = std::max(0, o_p_h); + (*out_pad_size)[1] = std::max(0, o_p_w); } - return MaceStatus::MACE_SUCCESS; + if (padded_out_shape != nullptr) { + index_t output_channel = filter_shape[0] * group; + padded_out_shape->resize(4); + (*padded_out_shape)[0] = output_shape[0]; + (*padded_out_shape)[1] = + data_format == NCHW ? output_channel : padded_out_height; + (*padded_out_shape)[2] = + data_format == NCHW ? padded_out_height : padded_out_width; + (*padded_out_shape)[3] = + data_format == NCHW ? padded_out_width : output_channel; + } } +void CalcDeconvShape_Caffe(const std::vector &input_shape, + const std::vector &filter_shape, + const std::vector &strides, + const std::vector &out_pad_size, + const int group, + std::vector *out_shape, + std::vector *in_pad_size, + std::vector *padded_out_shape, + DataFormat data_format) { + const index_t + in_height = data_format == NCHW ? input_shape[2] : input_shape[1]; + const index_t + in_width = data_format == NCHW ? 
input_shape[3] : input_shape[2]; + + const index_t output_channel = filter_shape[0] * group; + + const index_t kernel_h = filter_shape[2]; + const index_t kernel_w = filter_shape[3]; + + index_t padded_out_height = + (in_height - 1) * strides[0] + kernel_h; + index_t padded_out_width = + (in_width - 1) * strides[1] + kernel_w; + + if (in_pad_size != nullptr) { + in_pad_size->resize(2); + (*in_pad_size)[0] = static_cast((kernel_h - 1) * 2 - out_pad_size[0]); + (*in_pad_size)[1] = static_cast((kernel_w - 1) * 2 - out_pad_size[1]); + (*in_pad_size)[0] = std::max(0, (*in_pad_size)[0]); + (*in_pad_size)[1] = std::max(0, (*in_pad_size)[1]); + } -MaceStatus ConstructNHWCInputWithPadding(const Tensor *input_tensor, - const int *paddings, - Tensor *output_tensor, - bool padding_same_value) { - Tensor::MappingGuard input_mapper(input_tensor); - const float *input = input_tensor->data(); - const index_t *input_shape = input_tensor->shape().data(); - - index_t batch = input_shape[0]; - index_t height = input_shape[1]; - index_t width = input_shape[2]; - index_t channels = input_shape[3]; - - std::vector output_shape( - {batch, paddings[0] + height, paddings[1] + width, channels}); - - const int output_height = output_shape[1]; - const int output_width = output_shape[2]; - const int padded_top = paddings[0] / 2; - const int padded_left = paddings[1] / 2; - - MACE_RETURN_IF_ERROR(output_tensor->Resize(output_shape)); + if (padded_out_shape != nullptr) { + padded_out_shape->resize(4); + (*padded_out_shape)[0] = input_shape[0]; + (*padded_out_shape)[1] = + data_format == NCHW ? output_channel : padded_out_height; + (*padded_out_shape)[2] = + data_format == NCHW ? padded_out_height : padded_out_width; + (*padded_out_shape)[3] = + data_format == NCHW ? padded_out_width : output_channel; + } - Tensor::MappingGuard padded_output_mapper(output_tensor); - float *output_data = output_tensor->mutable_data(); - memset(output_data, 0, output_tensor->size() * sizeof(float)); + if (out_shape != nullptr) { + index_t out_height = padded_out_height - out_pad_size[0]; + index_t out_width = padded_out_width - out_pad_size[1]; + out_shape->resize(4); + (*out_shape)[0] = input_shape[0]; + (*out_shape)[1] = data_format == NCHW ? output_channel : out_height; + (*out_shape)[2] = data_format == NCHW ? out_height : out_width; + (*out_shape)[3] = data_format == NCHW ? 
out_width : output_channel; + } +} - // Skip the padded top rows - if (padding_same_value) { - LOG(FATAL) << "Not implemented"; - } else { -#pragma omp parallel for collapse(3) schedule(runtime) - for (int n = 0; n < batch; ++n) { - for (int h = 0; h < height; ++h) { - for (int w = 0; w < width; ++w) { - const float *input_ptr = - input + ((n * height + h) * width + w) * channels; - float *output_ptr = - output_data + - ((n * output_height + h + padded_top) * output_width + w + - padded_left) * - channels; - memcpy(output_ptr, input_ptr, channels * sizeof(float)); - } - } +void CalDeconvOutputShapeAndPadSize(const std::vector &input_shape, + const std::vector &filter_shape, + const std::vector &strides, + Padding padding_type, + const std::vector &paddings, + int group, + std::vector *output_shape, + std::vector *in_pad_size, + std::vector *out_pad_size, + std::vector *padded_out_shape, + FrameworkType framework_type, + DataFormat data_format) { + if (framework_type == FrameworkType::TENSORFLOW) { + MACE_CHECK(output_shape->size() == 4, + "deconv output shape shoud be 4-dims"); + std::vector &out_shape = *output_shape; + if (data_format == NCHW) { + const index_t t = out_shape[1]; + out_shape[1] = out_shape[3]; + out_shape[3] = out_shape[2]; + out_shape[2] = t; } - } - return MaceStatus::MACE_SUCCESS; + CalcDeconvShape_TF( + input_shape, + filter_shape, + *output_shape, + strides, + padding_type, + group, + in_pad_size, + out_pad_size, + padded_out_shape, + data_format); + } else { // caffe + if (!paddings.empty()) *out_pad_size = paddings; + CalcDeconvShape_Caffe( + input_shape, + filter_shape, + strides, + *out_pad_size, + group, + output_shape, + in_pad_size, + padded_out_shape, + data_format); + } } } // namespace ops diff --git a/mace/ops/common/conv_pool_2d_util.h b/mace/ops/common/conv_pool_2d_util.h index e8d0d335f1e0900cf1c265817cbcd73dd63c66b3..389575d76a78b7154887865f203ee8c29f059a4d 100644 --- a/mace/ops/common/conv_pool_2d_util.h +++ b/mace/ops/common/conv_pool_2d_util.h @@ -15,6 +15,7 @@ #ifndef MACE_OPS_COMMON_CONV_POOL_2D_UTIL_H_ #define MACE_OPS_COMMON_CONV_POOL_2D_UTIL_H_ +#include #include "mace/core/tensor.h" namespace mace { @@ -77,41 +78,25 @@ void CalcOutputSize(const index_t *input_shape, // NHWC index_t *output_shape); void CalcNCHWOutputSize(const index_t *input_shape, - const index_t *filter_shape, - const int *padding_size, - const int *dilations, - const int *strides, - const RoundType round_type, - index_t *output_shape); - -void CalcNCHWInputShape(const index_t *output_shape, const index_t *filter_shape, - const int *strides, + const int *padding_size, const int *dilations, - index_t *input_shape); - -void CalPaddingSize(const index_t *input_shape, // NCHW - const index_t *filter_shape, // OIHW - const int *dilations, - const int *strides, - Padding padding, - int *padding_size); - - -MaceStatus ConstructNCHWInputWithSpecificPadding(const Tensor *input, - const int pad_top, const int pad_bottom, - const int pad_left, const int pad_right, - Tensor *output_tensor); - -MaceStatus ConstructNCHWInputWithPadding(const Tensor *input, - const int *paddings, - Tensor *output_tensor, - bool padding_same_value = false); - -MaceStatus ConstructNHWCInputWithPadding(const Tensor *input, - const int *paddings, - Tensor *output_tensor, - bool padding_same_value = false); + const int *strides, + const RoundType round_type, + index_t *output_shape); + +void CalDeconvOutputShapeAndPadSize(const std::vector &input_shape, + const std::vector &filter_shape, + const std::vector 
&strides, + Padding padding_type, + const std::vector &paddings, + int group, + std::vector *output_shape, + std::vector *in_pad_size, + std::vector *out_pad_size, + std::vector *padded_out_shape, + FrameworkType framework_type, + DataFormat data_format); } // namespace ops } // namespace mace diff --git a/mace/ops/common/gemmlowp_util.h b/mace/ops/common/gemmlowp_util.h index c7eed2ad275c9b51cc5cf55cf2f88f90edf3d500..a01ec82ef68cd84897d1090e7e958d8807fae214 100644 --- a/mace/ops/common/gemmlowp_util.h +++ b/mace/ops/common/gemmlowp_util.h @@ -19,7 +19,7 @@ #include "public/gemmlowp.h" #include "mace/core/types.h" -#include "mace/utils/quantize.h" +#include "mace/core/quantize.h" namespace mace { diff --git a/mace/ops/common/lstm.cc b/mace/ops/common/lstm.cc index beea3f5b8081584b219cd6c662c4451dfe4cc223..cde148e1560168b7ddd9138a7fb4847663bc9de2 100644 --- a/mace/ops/common/lstm.cc +++ b/mace/ops/common/lstm.cc @@ -21,7 +21,8 @@ namespace mace { namespace ops { -void LSTMNonlinearKernel(const float *input_data, +void LSTMNonlinearKernel(const OpContext *context, + const float *input_data, const float *prev_data, const float *scale_data, const float *params_data, @@ -34,41 +35,44 @@ void LSTMNonlinearKernel(const float *input_data, float f_scale = (embed_scales && scale_data) ? scale_data[1] : 1.0f; float o_scale = (embed_scales && scale_data) ? scale_data[2] : 1.0f; - if (prev_data == nullptr) { -#pragma omp parallel for schedule(runtime) - for (int c = 0; c < cell_dim; ++c) { - float i_part = input_data[c]; - float c_part = input_data[c + 2 * cell_dim]; - float o_part = input_data[c + 3 * cell_dim]; - float w_oc = params_data[c + params_stride * 2]; - float i_t = ScalarSigmoid(i_part); - float c_t = i_t * i_scale * std::tanh(c_part); - float o_t = ScalarSigmoid(o_part + w_oc * c_t); - float m_t = o_t * o_scale * std::tanh(c_t); - output_cell[c] = c_t; - output_data[c] = m_t; - } - } else { -#pragma omp parallel for schedule(runtime) - for (int c = 0; c < cell_dim; ++c) { - float i_part = input_data[c]; - float f_part = input_data[c + cell_dim]; - float c_part = input_data[c + 2 * cell_dim]; - float o_part = input_data[c + 3 * cell_dim]; - float c_prev = prev_data[c]; - float w_ic = params_data[c]; - float w_fc = params_data[c + params_stride]; - float w_oc = params_data[c + params_stride * 2]; - float i_t = ScalarSigmoid(i_part + w_ic * c_prev); - float f_t = ScalarSigmoid(f_part + w_fc * c_prev); - float c_t = - f_t * f_scale * c_prev + i_t * i_scale * std::tanh(c_part); - float o_t = ScalarSigmoid(o_part + w_oc * c_t); - float m_t = o_t * o_scale * std::tanh(c_t); - output_cell[c] = c_t; - output_data[c] = m_t; + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { + if (prev_data == nullptr) { + for (index_t c = start; c < end; c += step) { + float i_part = input_data[c]; + float c_part = input_data[c + 2 * cell_dim]; + float o_part = input_data[c + 3 * cell_dim]; + float w_oc = params_data[c + params_stride * 2]; + float i_t = ScalarSigmoid(i_part); + float c_t = i_t * i_scale * std::tanh(c_part); + float o_t = ScalarSigmoid(o_part + w_oc * c_t); + float m_t = o_t * o_scale * std::tanh(c_t); + output_cell[c] = c_t; + output_data[c] = m_t; + } + } else { + for (index_t c = start; c < end; c += step) { + float i_part = input_data[c]; + float f_part = input_data[c + cell_dim]; + float c_part = input_data[c + 2 * cell_dim]; + float o_part = input_data[c + 3 * cell_dim]; + float c_prev 
= prev_data[c]; + float w_ic = params_data[c]; + float w_fc = params_data[c + params_stride]; + float w_oc = params_data[c + params_stride * 2]; + float i_t = ScalarSigmoid(i_part + w_ic * c_prev); + float f_t = ScalarSigmoid(f_part + w_fc * c_prev); + float c_t = + f_t * f_scale * c_prev + i_t * i_scale * std::tanh(c_part); + float o_t = ScalarSigmoid(o_part + w_oc * c_t); + float m_t = o_t * o_scale * std::tanh(c_t); + output_cell[c] = c_t; + output_data[c] = m_t; + } } - } + }, 0, cell_dim, 1); } } // namespace ops diff --git a/mace/ops/common/lstm.h b/mace/ops/common/lstm.h index b835386041b6ba86f13818fe4f57c1efb1dff15d..d9e4024894dba1a7c3995e8239ef0a9e814a50e9 100644 --- a/mace/ops/common/lstm.h +++ b/mace/ops/common/lstm.h @@ -16,10 +16,13 @@ #define MACE_OPS_COMMON_LSTM_H_ #include "mace/core/types.h" +#include "mace/core/op_context.h" + namespace mace { namespace ops { -void LSTMNonlinearKernel(const float *input_data, +void LSTMNonlinearKernel(const OpContext *opContext, + const float *input_data, const float *prev_data, const float *scale_data, const float *params_data, @@ -29,7 +32,6 @@ void LSTMNonlinearKernel(const float *input_data, float *output_cell, float *output_data); - } // namespace ops } // namespace mace diff --git a/mace/ops/common/transpose.cc b/mace/ops/common/transpose.cc deleted file mode 100644 index 79a7a6be064368f34864fee115af6d7735b50a83..0000000000000000000000000000000000000000 --- a/mace/ops/common/transpose.cc +++ /dev/null @@ -1,138 +0,0 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
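
The lstm.cc change above is the pattern applied throughout this patch: an `#pragma omp parallel for` loop becomes a `utils::ThreadPool::Compute1D` (or `Compute2D`/`Compute3D`) call, the loop body moves into a lambda over a `[start, end)` range with a `step`, and the original loop bounds are passed as the trailing arguments. A minimal sketch of that conversion, assuming only the `Compute1D(body, start, end, step)` call shape used in this patch; the kernel names here are illustrative and not part of the change:

// Before: OpenMP drives an element-wise loop (illustrative kernel).
void ScaleKernelOmp(const float *in, float *out, index_t n, float alpha) {
#pragma omp parallel for schedule(runtime)
  for (index_t i = 0; i < n; ++i) {
    out[i] = alpha * in[i];
  }
}

// After: the same body runs inside a ThreadPool::Compute1D slice taken
// from the device's CPU runtime, mirroring LSTMNonlinearKernel above.
void ScaleKernelThreadPool(const OpContext *context, const float *in,
                           float *out, index_t n, float alpha) {
  utils::ThreadPool &thread_pool =
      context->device()->cpu_runtime()->thread_pool();
  thread_pool.Compute1D([=](index_t start, index_t end, index_t step) {
    for (index_t i = start; i < end; i += step) {
      out[i] = alpha * in[i];
    }
  }, 0, n, 1);
}

The lambda captures by value, so a kernel that previously read its function parameters directly (as LSTMNonlinearKernel did) keeps the same body inside the slice.
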
- -#include "mace/ops/common/transpose.h" - -#if defined(MACE_ENABLE_NEON) -#include -#endif - -namespace mace { -namespace ops { - -namespace transpose { -void TransposeNHWCToNCHWC3(const float *input, - float *output, - const index_t height, - const index_t width) { - index_t image_size = height * width; - -#pragma omp parallel for - for (index_t h = 0; h < height; ++h) { - index_t in_offset = h * width * 3; - index_t out_offset = h * width; - -#if defined(MACE_ENABLE_NEON) - index_t w; - for (w = 0; w + 3 < width; w += 4) { - float32x4x3_t vi = vld3q_f32(input + in_offset); - vst1q_f32(output + out_offset, vi.val[0]); - vst1q_f32(output + out_offset + image_size, vi.val[1]); - vst1q_f32(output + out_offset + image_size * 2, vi.val[2]); - - in_offset += 12; - out_offset += 4; - } - for (; w < width; ++w) { - for (index_t c = 0; c < 3; ++c) { - output[h * width + image_size * c + w] = - input[h * width * 3 + w * 3 + c]; - } - } -#else - for (index_t w = 0; w < width; ++w) { - for (index_t c = 0; c < 3; ++c) { - output[out_offset + c * image_size + w] = input[in_offset + w * 3 + c]; - } - } -#endif - } -} - -void TransposeNCHWToNHWCC2(const float *input, - float *output, - const index_t height, - const index_t width) { - index_t image_size = height * width; -#pragma omp parallel for - for (index_t h = 0; h < height; ++h) { - index_t in_offset = h * width; - index_t out_offset = h * width * 2; - -#if defined(MACE_ENABLE_NEON) - index_t w; - for (w = 0; w + 3 < width; w += 4) { - float32x4_t vi0 = vld1q_f32(input + in_offset); - float32x4_t vi1 = vld1q_f32(input + in_offset + image_size); - float32x4x2_t vi = {vi0, vi1}; - vst2q_f32(output + out_offset, vi); - in_offset += 4; - out_offset += 8; - } - for (; w < width; ++w) { - for (index_t c = 0; c < 2; ++c) { - output[h * width * 2 + w * 2 + c] = - input[h * width + image_size * c + w]; - } - } -#else - for (index_t w = 0; w < width; ++w) { - for (index_t c = 0; c < 2; ++c) { - output[out_offset + w * 2 + c] = input[in_offset + c * image_size + w]; - } - } -#endif - } -} - -void TransposeNHWCToNCHWC3(const int *input, - int *output, - const index_t height, - const index_t width) { - index_t image_size = height * width; - -#pragma omp parallel for - for (index_t h = 0; h < height; ++h) { - index_t in_offset = h * width * 3; - index_t out_offset = h * width; - - for (index_t w = 0; w < width; ++w) { - for (index_t c = 0; c < 3; ++c) { - output[out_offset + c * image_size + w] = input[in_offset + w * 3 + c]; - } - } - } -} - -void TransposeNCHWToNHWCC2(const int *input, - int *output, - const index_t height, - const index_t width) { - index_t image_size = height * width; -#pragma omp parallel for - for (index_t h = 0; h < height; ++h) { - index_t in_offset = h * width; - index_t out_offset = h * width * 2; - - for (index_t w = 0; w < width; ++w) { - for (index_t c = 0; c < 2; ++c) { - output[out_offset + w * 2 + c] = input[in_offset + c * image_size + w]; - } - } - } -} -} // namespace transpose - -} // namespace ops -} // namespace mace diff --git a/mace/ops/common/transpose.h b/mace/ops/common/transpose.h index 4d2e5a519e680276884fb95ad6edf088738c99d0..0c0751851f695ac9974bf3e386b32adf2cf28370 100644 --- a/mace/ops/common/transpose.h +++ b/mace/ops/common/transpose.h @@ -15,43 +15,152 @@ #ifndef MACE_OPS_COMMON_TRANSPOSE_H_ #define MACE_OPS_COMMON_TRANSPOSE_H_ +#if defined(MACE_ENABLE_NEON) +#include +#endif // MACE_ENABLE_NEON #include #include - +#include "mace/core/op_context.h" #include "mace/public/mace.h" -#include "mace/core/tensor.h" 
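
For orientation: the deleted mace/ops/common/transpose.cc above implemented the channel-3 NHWC -> NCHW and channel-2 NCHW -> NHWC fast paths, and the transpose.h hunk below re-creates them as header templates (with float specializations keeping the NEON vld3q_f32/vst2q_f32 paths) driven by the thread pool. The underlying layout transform is a per-image permutation; a scalar sketch with an illustrative name, matching the non-NEON branch of the code:

// Element (h, w, c) of a 3-channel NHWC image moves to plane c of the
// NCHW output; image_size is the size of one output plane.
void TransposeNHWCToNCHWC3Ref(const float *input, float *output,
                              index_t height, index_t width) {
  const index_t image_size = height * width;
  for (index_t h = 0; h < height; ++h) {
    for (index_t w = 0; w < width; ++w) {
      for (index_t c = 0; c < 3; ++c) {
        output[c * image_size + h * width + w] =
            input[(h * width + w) * 3 + c];
      }
    }
  }
}

The batched Transpose() entry point below dispatches to these per-image helpers when dst_dims is {0, 3, 1, 2} with 3 input channels or {0, 2, 3, 1} with 2 input channels, and otherwise falls back to the tiled generic transposes.
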
namespace mace { namespace ops { -namespace transpose { -void TransposeNHWCToNCHWC3(const float *input, - float *output, +template +void TransposeNHWCToNCHWC3(utils::ThreadPool *thread_pool, + const T *input, + T *output, const index_t height, - const index_t width); + const index_t width) { + index_t image_size = height * width; -void TransposeNHWCToNCHWC3(const int *input, - int *output, - const index_t height, - const index_t width); + thread_pool->Compute1D([=](index_t start, index_t end, index_t step) { + for (index_t h = start; h < end; h += step) { + index_t in_offset = h * width * 3; + index_t out_offset = h * width; -void TransposeNCHWToNHWCC2(const float *input, - float *output, - const index_t height, - const index_t width); + for (index_t w = 0; w < width; ++w) { + for (index_t c = 0; c < 3; ++c) { + output[out_offset + c * image_size + w] = + input[in_offset + w * 3 + c]; + } + } + } + }, 0, height, 1); +} + +template<> +inline void TransposeNHWCToNCHWC3(utils::ThreadPool *thread_pool, + const float *input, + float *output, + const index_t height, + const index_t width) { + index_t image_size = height * width; + + thread_pool->Compute1D([=](index_t start, index_t end, index_t step) { + for (index_t h = start; h < end; h += step) { + index_t in_offset = h * width * 3; + index_t out_offset = h * width; -void TransposeNCHWToNHWCC2(const int *input, - int *output, +#if defined(MACE_ENABLE_NEON) + index_t w; + for (w = 0; w + 3 < width; w += 4) { + float32x4x3_t vi = vld3q_f32(input + in_offset); + vst1q_f32(output + out_offset, vi.val[0]); + vst1q_f32(output + out_offset + image_size, vi.val[1]); + vst1q_f32(output + out_offset + image_size * 2, vi.val[2]); + + in_offset += 12; + out_offset += 4; + } + for (; w < width; ++w) { + for (index_t c = 0; c < 3; ++c) { + output[h * width + image_size * c + w] = + input[h * width * 3 + w * 3 + c]; + } + } +#else + for (index_t w = 0; w < width; ++w) { + for (index_t c = 0; c < 3; ++c) { + output[out_offset + c * image_size + w] = + input[in_offset + w * 3 + c]; + } + } +#endif + } + }, 0, height, 1); +} + +template +void TransposeNCHWToNHWCC2(utils::ThreadPool *thread_pool, + const T *input, + T *output, const index_t height, - const index_t width); -} // namespace transpose + const index_t width) { + index_t image_size = height * width; + + thread_pool->Compute1D([=](index_t start, index_t end, index_t step) { + for (index_t h = start; h < end; h += step) { + index_t in_offset = h * width; + index_t out_offset = h * width * 2; + + for (index_t w = 0; w < width; ++w) { + for (index_t c = 0; c < 2; ++c) { + output[out_offset + w * 2 + c] = + input[in_offset + c * image_size + w]; + } + } + } + }, 0, height, 1); +} -template -MaceStatus Transpose(const T *input, +template<> +inline void TransposeNCHWToNHWCC2(utils::ThreadPool *thread_pool, + const float *input, + float *output, + const index_t height, + const index_t width) { + index_t image_size = height * width; + + thread_pool->Compute1D([=](index_t start, index_t end, index_t step) { + for (index_t h = start; h < end; h += step) { + index_t in_offset = h * width; + index_t out_offset = h * width * 2; + +#if defined(MACE_ENABLE_NEON) + index_t w; + for (w = 0; w + 3 < width; w += 4) { + float32x4_t vi0 = vld1q_f32(input + in_offset); + float32x4_t vi1 = vld1q_f32(input + in_offset + image_size); + float32x4x2_t vi = {vi0, vi1}; + vst2q_f32(output + out_offset, vi); + in_offset += 4; + out_offset += 8; + } + for (; w < width; ++w) { + for (index_t c = 0; c < 2; ++c) { + output[h * width * 
2 + w * 2 + c] = + input[h * width + image_size * c + w]; + } + } +#else + for (index_t w = 0; w < width; ++w) { + for (index_t c = 0; c < 2; ++c) { + output[out_offset + w * 2 + c] = + input[in_offset + c * image_size + w]; + } + } +#endif + } + }, 0, height, 1); +} + +template +MaceStatus Transpose(utils::ThreadPool *thread_pool, + const T *input, const std::vector &input_shape, const std::vector &dst_dims, - T *output, - DataType data_type = DataType::DT_FLOAT) { + T *output) { MACE_CHECK((input_shape.size() == 2 && dst_dims.size() == 2) || (input_shape.size() == 4 && dst_dims.size() == 4), "Only support 2D or 4D transpose"); @@ -68,41 +177,43 @@ MaceStatus Transpose(const T *input, index_t stride_i = height; index_t stride_j = width; index_t tile_size = height > 512 || width > 512 ? 64 : 32; -#pragma omp parallel for collapse(2) - for (index_t i = 0; i < height; i += tile_size) { - for (index_t j = 0; j < width; j += tile_size) { - index_t end_i = std::min(i + tile_size, height); - index_t end_j = std::min(j + tile_size, width); - for (index_t tile_i = i; tile_i < end_i; ++tile_i) { - for (index_t tile_j = j; tile_j < end_j; ++tile_j) { - output[tile_j * stride_i + tile_i] = - input[tile_i * stride_j + tile_j]; + + thread_pool->Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t i = start0; i < end0; i += step0) { + for (index_t j = start1; j < end1; j += step1) { + index_t end_i = std::min(i + tile_size, height); + index_t end_j = std::min(j + tile_size, width); + for (index_t tile_i = i; tile_i < end_i; ++tile_i) { + for (index_t tile_j = j; tile_j < end_j; ++tile_j) { + output[tile_j * stride_i + tile_i] = + input[tile_i * stride_j + tile_j]; + } } } } - } + }, 0, height, tile_size, 0, width, tile_size); } else if (input_shape.size() == 4) { std::vector transpose_order_from_NHWC_to_NCHW{0, 3, 1, 2}; std::vector transpose_order_from_NCHW_to_NHWC{0, 2, 3, 1}; index_t batch_size = input_shape[1] * input_shape[2] * input_shape[3]; - bool supported_dt = (data_type == DataType::DT_FLOAT || - data_type == DataType::DT_INT32); - if (dst_dims == transpose_order_from_NHWC_to_NCHW && input_shape[3] == 3 && - supported_dt) { + if (dst_dims == transpose_order_from_NHWC_to_NCHW && input_shape[3] == 3) { for (index_t b = 0; b < input_shape[0]; ++b) { - transpose::TransposeNHWCToNCHWC3(input + b * batch_size, - output + b * batch_size, - input_shape[1], - input_shape[2]); + TransposeNHWCToNCHWC3(thread_pool, + input + b * batch_size, + output + b * batch_size, + input_shape[1], + input_shape[2]); } } else if (dst_dims == transpose_order_from_NCHW_to_NHWC - && input_shape[1] == 2 && supported_dt) { + && input_shape[1] == 2) { for (index_t b = 0; b < input_shape[0]; ++b) { - transpose::TransposeNCHWToNHWCC2(input + b * batch_size, - output + b * batch_size, - input_shape[2], - input_shape[3]); + TransposeNCHWToNHWCC2(thread_pool, + input + b * batch_size, + output + b * batch_size, + input_shape[2], + input_shape[3]); } } else if (dst_dims == std::vector{0, 2, 1, 3}) { index_t height = input_shape[1]; @@ -114,7 +225,6 @@ MaceStatus Transpose(const T *input, index_t tile_size = std::max(static_cast(1), static_cast(std::sqrt( 8 * 1024 / channel))); -#pragma omp parallel for collapse(2) for (index_t i = 0; i < height; i += tile_size) { for (index_t j = 0; j < width; j += tile_size) { index_t end_i = std::min(i + tile_size, height); @@ -163,7 +273,6 @@ MaceStatus Transpose(const T *input, return MaceStatus::MACE_SUCCESS; } - } // 
namespace ops } // namespace mace diff --git a/mace/ops/concat.cc b/mace/ops/concat.cc index 8d4248570f7453002cc68024cd4017208da7e284..1254c643ceee467276d4c3b7af83d6f9f9238458 100644 --- a/mace/ops/concat.cc +++ b/mace/ops/concat.cc @@ -15,7 +15,7 @@ #include #include "mace/core/operator.h" -#include "mace/utils/quantize.h" +#include "mace/core/quantize.h" #include "mace/utils/memory.h" #ifdef MACE_ENABLE_OPENCL diff --git a/mace/ops/conv_2d.cc b/mace/ops/conv_2d.cc index a6421f45fed1b0520e468acaae58c5439c8c03e3..5fefeddcd1c523c0da1c3f1c384119f4865b361e 100644 --- a/mace/ops/conv_2d.cc +++ b/mace/ops/conv_2d.cc @@ -41,6 +41,11 @@ #include "mace/ops/arm/fp32/conv_2d_7x7.h" #include "mace/ops/arm/fp32/conv_2d_1xn.h" #include "mace/ops/arm/fp32/conv_general.h" +#include "mace/ops/arm/fp32/bias_add.h" +#include "mace/ops/arm/fp32/activation.h" +#else +#include "mace/ops/ref/activation.h" +#include "mace/ops/ref/bias_add.h" #endif // MACE_ENABLE_NEON #include "mace/ops/ref/conv_2d.h" @@ -67,12 +72,13 @@ class Conv2dOp : public ConvPool2dOpBase { public: explicit Conv2dOp(OpConstructContext *context) : ConvPool2dOpBase(context), - activation_(ops::StringToActivationType( + activation_delegator_(ops::StringToActivationType( Operation::GetOptionalArg("activation", - "NOOP"))), - relux_max_limit_(Operation::GetOptionalArg("max_limit", 0.0f)), - leakyrelu_coefficient_(Operation::GetOptionalArg( - "leakyrelu_coefficient", 0.0f)) {} + "NOOP")), + Operation::GetOptionalArg("max_limit", + 0.0f), + Operation::GetOptionalArg( + "leakyrelu_coefficient", 0.0f)) {} MaceStatus Run(OpContext *context) override { const Tensor *input = this->Input(INPUT); @@ -80,8 +86,6 @@ class Conv2dOp : public ConvPool2dOpBase { const Tensor *bias = this->InputSize() >= 3 ? this->Input(BIAS) : nullptr; Tensor *output = this->Output(OUTPUT); - const index_t channels = filter->dim(0); - #ifdef MACE_ENABLE_NEON // the following params are used to decide which conv delegator to use const index_t stride_h = strides_[0]; @@ -91,11 +95,12 @@ class Conv2dOp : public ConvPool2dOpBase { const index_t filter_h = filter->dim(2); const index_t filter_w = filter->dim(3); const index_t input_channels = input->dim(1); + const index_t channels = filter->dim(0); // NOTE: delegator is fixed after first round of running, // although winograd depends on input params. // We do not support changeable filter for now. - if (conv2d_delegator_.get() == nullptr) { + if (conv2d_delegator_ == nullptr) { if (filter_h == 1 && filter_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1) { conv2d_delegator_ = make_unique( @@ -166,7 +171,7 @@ class Conv2dOp : public ConvPool2dOpBase { conv2d_delegator_->Compute(context, input, filter, output); #else - if (ref_conv2d_delegator_.get() == nullptr) { + if (ref_conv2d_delegator_ == nullptr) { ref_conv2d_delegator_ = make_unique>(strides_, dilations_, paddings_, @@ -175,53 +180,21 @@ class Conv2dOp : public ConvPool2dOpBase { ref_conv2d_delegator_->Compute(context, input, filter, output); #endif - Tensor::MappingGuard bias_guard(bias); - Tensor::MappingGuard output_guard(output); - auto bias_data = bias == nullptr ? 
nullptr : bias->data(); - auto output_data = output->mutable_data(); - if (bias_data != nullptr) { - const index_t batch = input->dim(0); - const index_t height = output->dim(2); - const index_t width = output->dim(3); - const index_t image_size = height * width; -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < batch; ++b) { - for (index_t c = 0; c < channels; ++c) { - float *output_ptr = output_data + (b * channels + c) * image_size; - const float bias = bias_data[c]; -#if defined(MACE_ENABLE_NEON) - float32x4_t vbias = vdupq_n_f32(bias); - for (index_t i = 0; i <= image_size - 4; i += 4) { - float32x4_t v = vld1q_f32(output_ptr + i); - v = vaddq_f32(v, vbias); - vst1q_f32(output_ptr + i, v); - } - for (index_t i = (image_size >> 2) << 2; i < image_size; ++i) { - output_ptr[i] += bias; - } -#else - for (index_t i = 0; i < image_size; ++i) { - output_ptr[i] += bias; - } -#endif - } - } - } - - DoActivation(output_data, output_data, output->size(), activation_, - relux_max_limit_, leakyrelu_coefficient_); + bias_add_delegator_.Compute(context, output, bias, output); + activation_delegator_.Compute(context, output, output); return MaceStatus::MACE_SUCCESS; } private: - const ActivationType activation_; - const float relux_max_limit_; - const float leakyrelu_coefficient_; #ifdef MACE_ENABLE_NEON std::unique_ptr conv2d_delegator_; + arm::fp32::BiasAdd bias_add_delegator_; + arm::fp32::Activation activation_delegator_; #else std::unique_ptr> ref_conv2d_delegator_; + ref::BiasAdd bias_add_delegator_; + ref::Activation activation_delegator_; #endif // MACE_ENABLE_NEON private: @@ -230,17 +203,17 @@ class Conv2dOp : public ConvPool2dOpBase { }; #ifdef MACE_ENABLE_QUANTIZE -template <> +template<> class Conv2dOp : public ConvPool2dOpBase { public: explicit Conv2dOp(OpConstructContext *context) : ConvPool2dOpBase(context), activation_(ops::StringToActivationType( Operation::GetOptionalArg("activation", - "NOOP"))), + "NOOP"))), relux_max_limit_(Operation::GetOptionalArg("max_limit", 0.0f)), leakyrelu_coefficient_(Operation::GetOptionalArg( - "leakyrelu_coefficient", 0.0f)) {} + "leakyrelu_coefficient", 0.0f)) {} MaceStatus Run(OpContext *context) override { const Tensor *input = this->Input(INPUT); @@ -334,7 +307,7 @@ class Conv2dOp : public ConvPool2dOpBase { scratch->GrowSize(im2col_size); im2col = make_unique(scratch->Scratch(im2col_size), DT_UINT8); uint8_t *im2col_data = im2col->mutable_data(); - Im2col(input_data, input->shape(), filter_h, filter_w, stride_h, + Im2col(context, input_data, input->shape(), filter_h, filter_w, stride_h, stride_w, static_cast(input->zero_point()), paddings[0], paddings[1], output->shape(), depth, im2col_data); gemm_input_data = im2col_data; @@ -366,87 +339,98 @@ class Conv2dOp : public ConvPool2dOpBase { } private: - template + template inline void Im2col( + const OpContext *context, const T *in_data, const std::vector &in_shape, const index_t filter_h, const index_t filter_w, const index_t stride_h, const index_t stride_w, const T zero_point, const int pad_height, const int pad_width, const std::vector &out_shape, - const index_t depth, T* im2col_data) { + const index_t depth, T *im2col_data) { const index_t input_row_size = in_shape[2] * in_shape[3]; const index_t patch_row_size = filter_w * in_shape[3]; -#pragma omp parallel for collapse(3) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t h = 0; h < out_shape[1]; ++h) { - for (index_t w = 0; w < out_shape[2]; ++w) { - // Reshape a patch of input to 
column, which is corresponding to - // a column of output(:, column). - const index_t ih_begin = h * stride_h - (pad_height >> 1); - const index_t ih_end = ih_begin + filter_h; - const index_t iw_begin = w * stride_w - (pad_width >> 1); - const index_t iw_end = iw_begin + filter_w; - // gate height and width to separate padding - const index_t ih_begin_gated = std::max(0, ih_begin); - const index_t ih_end_gated = std::min(ih_end, in_shape[1]); - const index_t iw_begin_gated = std::max(0, iw_begin); - const index_t iw_end_gated = std::min(iw_end, in_shape[2]); - const index_t pad_top = std::max(0, -ih_begin); - const index_t pad_bottom = ih_end - ih_end_gated; - const index_t pad_left = std::max(0, -iw_begin); - const index_t pad_right = iw_end - iw_end_gated; - index_t im2col_column_offset = - ((b * out_shape[1] + h) * out_shape[2] + w) * depth; - - // fill in padding top - if (pad_top > 0) { - std::fill_n(im2col_data + im2col_column_offset, - pad_top * patch_row_size, zero_point); - } + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute3D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1, + index_t start2, index_t end2, index_t step2) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t h = start1; h < end1; h += step1) { + for (index_t w = start2; w < end2; w += step2) { + // Reshape a patch of input to column, which is corresponding to + // a column of output(:, column). + const index_t ih_begin = h * stride_h - (pad_height >> 1); + const index_t ih_end = ih_begin + filter_h; + const index_t iw_begin = w * stride_w - (pad_width >> 1); + const index_t iw_end = iw_begin + filter_w; + // gate height and width to separate padding + const index_t ih_begin_gated = std::max(0, ih_begin); + const index_t ih_end_gated = std::min(ih_end, in_shape[1]); + const index_t iw_begin_gated = std::max(0, iw_begin); + const index_t iw_end_gated = std::min(iw_end, in_shape[2]); + const index_t pad_top = std::max(0, -ih_begin); + const index_t pad_bottom = ih_end - ih_end_gated; + const index_t pad_left = std::max(0, -iw_begin); + const index_t pad_right = iw_end - iw_end_gated; + index_t im2col_column_offset = + ((b * out_shape[1] + h) * out_shape[2] + w) * depth; + + // fill in padding top + if (pad_top > 0) { + std::fill_n(im2col_data + im2col_column_offset, + pad_top * patch_row_size, zero_point); + } - const index_t patch_row_size_gated = - std::min(filter_w - pad_left, - in_shape[2] - iw_begin_gated) * in_shape[3]; - MACE_CHECK(patch_row_size_gated == - ((filter_w - (pad_left + pad_right)) * in_shape[3])); - const index_t pad_left_size = pad_left * in_shape[3]; - const index_t pad_right_size = pad_right * in_shape[3]; - index_t im2col_offset = im2col_column_offset + - (pad_top * filter_w + pad_left) * in_shape[3]; - index_t in_offset = ((b * in_shape[1] + ih_begin_gated) * in_shape[2] - + iw_begin_gated) * in_shape[3]; - - // fill in effective rows - for (index_t ih = ih_begin_gated; ih < ih_end_gated; ++ih) { - // fill in padding left - if (pad_left > 0) { - const index_t left_offset = im2col_offset - pad_left_size; - std::fill_n(im2col_data + left_offset, pad_left_size, zero_point); + const index_t patch_row_size_gated = + std::min(filter_w - pad_left, + in_shape[2] - iw_begin_gated) * in_shape[3]; + MACE_CHECK(patch_row_size_gated == + ((filter_w - (pad_left + pad_right)) * in_shape[3])); + const index_t pad_left_size = pad_left * in_shape[3]; + const index_t pad_right_size = 
pad_right * in_shape[3]; + index_t im2col_offset = im2col_column_offset + + (pad_top * filter_w + pad_left) * in_shape[3]; + index_t + in_offset = ((b * in_shape[1] + ih_begin_gated) * in_shape[2] + + iw_begin_gated) * in_shape[3]; + + // fill in effective rows + for (index_t ih = ih_begin_gated; ih < ih_end_gated; ++ih) { + // fill in padding left + if (pad_left > 0) { + const index_t left_offset = im2col_offset - pad_left_size; + std::fill_n(im2col_data + left_offset, + pad_left_size, + zero_point); + } + // copy effective data + std::copy_n(in_data + in_offset, patch_row_size_gated, + im2col_data + im2col_offset); + // fill in padding right + if (pad_right > 0) { + const index_t + right_offset = im2col_offset + patch_row_size_gated; + std::fill_n(im2col_data + right_offset, pad_right_size, + zero_point); + } + in_offset += input_row_size; + im2col_offset += patch_row_size; } - // copy effective data - std::copy_n(in_data + in_offset, patch_row_size_gated, - im2col_data + im2col_offset); - // fill in padding right - if (pad_right > 0) { - const index_t right_offset = im2col_offset + patch_row_size_gated; - std::fill_n(im2col_data + right_offset, pad_right_size, + + // fill in padding bottom + if (pad_bottom > 0) { + const index_t pad_bottom_size = pad_bottom * patch_row_size; + const index_t bottom_offset = + im2col_column_offset + depth - pad_bottom_size; + std::fill_n(im2col_data + bottom_offset, pad_bottom_size, zero_point); } - in_offset += input_row_size; - im2col_offset += patch_row_size; - } - - // fill in padding bottom - if (pad_bottom > 0) { - const index_t pad_bottom_size = pad_bottom * patch_row_size; - const index_t bottom_offset = - im2col_column_offset + depth - pad_bottom_size; - std::fill_n(im2col_data + bottom_offset, pad_bottom_size, - zero_point); } } } - } + }, 0, out_shape[0], 1, 0, out_shape[1], 1, 0, out_shape[2], 1); } private: diff --git a/mace/ops/conv_2d_test.cc b/mace/ops/conv_2d_test.cc index 49d11700a19668082a43efe8008f07ae8123acb4..7fb854787c032a5106c065d92830729d8243e9a1 100644 --- a/mace/ops/conv_2d_test.cc +++ b/mace/ops/conv_2d_test.cc @@ -1172,7 +1172,8 @@ void TestQuant(const index_t batch, auto bias_data = bias->data(); float bias_scale = q_input->scale() * q_filter->scale(); std::vector q_bias(bias->size()); - QuantizeWithScaleAndZeropoint( + QuantizeUtil quantize_util(OpTestContext::Get()->thread_pool()); + quantize_util.QuantizeWithScaleAndZeropoint( bias_data, bias->size(), bias_scale, 0, q_bias.data()); net.AddInputFromArray( "QuantizedBias", {out_channels}, q_bias, true, bias_scale, 0); diff --git a/mace/ops/crop.cc b/mace/ops/crop.cc index 3dda169dd80f02a258d854ce88c7f511beab0167..7265208efdd3d62d682c1689b82049ce2dd42e07 100644 --- a/mace/ops/crop.cc +++ b/mace/ops/crop.cc @@ -90,7 +90,7 @@ class CropOp : public Operation { const index_t in_img_size = input_shape[1] * input_shape[2] * input_shape[3]; const index_t in_hw = input_shape[2] * input_shape[3]; -#pragma omp parallel for collapse(3) + for (int b = 0; b < output_shape[0]; ++b) { for (int c = 0; c < output_shape[1]; ++c) { for (int h = 0; h < output_shape[2]; ++h) { diff --git a/mace/ops/cumsum.cc b/mace/ops/cumsum.cc index f0117270c80ce25bda50ab8e8461302b521c484e..302fdfd585f4a16a7da42ebe1fd495c4f0ce9b6e 100644 --- a/mace/ops/cumsum.cc +++ b/mace/ops/cumsum.cc @@ -78,7 +78,6 @@ class CumsumOp : public Operation { const index_t cum_size = input_shape[axis_]; if (!reverse_) { -#pragma omp parallel for for (index_t outer_idx = 0; outer_idx < outer_size; ++outer_idx) { index_t start_idx = 
outer_idx * cum_size * inner_size; for (index_t cum_idx = 0; cum_idx < cum_size; ++cum_idx) { @@ -105,7 +104,6 @@ class CumsumOp : public Operation { } } } else { -#pragma omp parallel for for (index_t outer_idx = outer_size - 1; outer_idx >= 0; --outer_idx) { index_t start_idx = outer_idx * cum_size * inner_size; for (index_t cum_idx = cum_size - 1; cum_idx >= 0; --cum_idx) { diff --git a/mace/ops/deconv_2d.cc b/mace/ops/deconv_2d.cc index 6e9a0fa8db36209887f86d0fdc75d5c5d1a5c2bc..5692425ad10ba05f92fdf06c428106bdf15455a9 100644 --- a/mace/ops/deconv_2d.cc +++ b/mace/ops/deconv_2d.cc @@ -16,6 +16,16 @@ #if defined(MACE_ENABLE_NEON) #include +#include "mace/ops/arm/fp32/deconv_2d_2x2.h" +#include "mace/ops/arm/fp32/deconv_2d_3x3.h" +#include "mace/ops/arm/fp32/deconv_2d_4x4.h" +#include "mace/ops/arm/fp32/deconv_2d_general.h" +#include "mace/ops/arm/fp32/bias_add.h" +#include "mace/ops/arm/fp32/activation.h" +#else +#include "mace/ops/ref/bias_add.h" +#include "mace/ops/ref/activation.h" +#include "mace/ops/ref/deconv_2d.h" #endif #include @@ -27,9 +37,10 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" #include "mace/ops/activation.h" -#include "mace/ops/arm/deconv_2d_neon.h" +#include "mace/ops/common/conv_pool_2d_util.h" #include "mace/utils/memory.h" #include "mace/utils/math.h" + #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/buffer_transformer.h" #include "mace/ops/opencl/image/deconv_2d.h" @@ -38,21 +49,24 @@ namespace mace { namespace ops { -template +template class Deconv2dOp; -template <> +template<> class Deconv2dOp : public Deconv2dOpBase { public: explicit Deconv2dOp(OpConstructContext *context) - : Deconv2dOpBase(context) {} + : Deconv2dOpBase(context), + activation_delegator_(activation_, + relux_max_limit_, + leakyrelu_coefficient_) {} MaceStatus Run(OpContext *context) override { const Tensor *input = this->Input(0); const Tensor *filter = this->Input(1); const Tensor *bias = nullptr; const Tensor *output_shape_tensor = nullptr; - if (model_type_ == ops::CAFFE) { + if (model_type_ == CAFFE) { bias = this->InputSize() >= 3 ? 
this->Input(2) : nullptr; } else { output_shape_tensor = @@ -65,91 +79,9 @@ class Deconv2dOp : public Deconv2dOpBase { MACE_CHECK_NOTNULL(filter); MACE_CHECK_NOTNULL(output); - std::vector in_paddings(2, 0); - std::vector out_paddings(2, 0); - std::vector out_shape(4, 0); - std::vector padded_out_shape(4, 0); - - if (model_type_ == FrameworkType::TENSORFLOW) { // tensorflow - MACE_CHECK_NOTNULL(output_shape_tensor); - MACE_CHECK(output_shape_tensor->size() == 4); - Tensor::MappingGuard output_shape_mapper(output_shape_tensor); - auto output_shape_data = - output_shape_tensor->data(); - out_shape = - std::vector(output_shape_data, output_shape_data + 4); - - const index_t t = out_shape[1]; - out_shape[1] = out_shape[3]; - out_shape[3] = out_shape[2]; - out_shape[2] = t; - - CalcDeconvShape_TF( - input->shape().data(), - filter->shape().data(), - out_shape.data(), - strides_.data(), - 1, - padding_type_, - in_paddings.data(), - out_paddings.data(), - padded_out_shape.data(), - true); - } else { // caffe - if (!paddings_.empty()) out_paddings = paddings_; - CalcDeconvShape_Caffe( - input->shape().data(), - filter->shape().data(), - strides_.data(), - out_paddings.data(), - 1, - in_paddings.data(), - out_shape.data(), - padded_out_shape.data(), - true); - } - MACE_RETURN_IF_ERROR(output->Resize(out_shape)); - output->Clear(); - index_t kernel_h = filter->dim(2); - index_t kernel_w = filter->dim(3); - const index_t *in_shape = input->shape().data(); - - MACE_CHECK(filter->dim(0) == out_shape[1], filter->dim(0), " != ", - out_shape[1]); - MACE_CHECK(filter->dim(1) == in_shape[1], filter->dim(1), " != ", - in_shape[1]); - MACE_CHECK(in_shape[0] == out_shape[0], - "Input/Output batch size mismatch"); - std::function deconv_func; - - Tensor::MappingGuard input_mapper(input); - Tensor::MappingGuard filter_mapper(filter); - Tensor::MappingGuard bias_mapper(bias); - Tensor::MappingGuard output_mapper(output); - auto input_data = input->data(); - auto filter_data = filter->data(); - auto bias_data = bias == nullptr ? 
nullptr : bias->data(); - auto output_data = output->mutable_data(); - - const index_t pad_h = out_paddings[0] / 2; - const index_t pad_w = out_paddings[1] / 2; - - index_t padded_out_size = - std::accumulate(padded_out_shape.begin(), - padded_out_shape.end(), - 1, - std::multiplies()) * sizeof(float); - ScratchBuffer *scratch = context->device()->scratch_buffer(); - scratch->Rewind(); - scratch->GrowSize(padded_out_size); - Tensor padded_out(scratch->Scratch(padded_out_size), DT_FLOAT); - padded_out.Reshape(padded_out_shape); - padded_out.Clear(); - auto *padded_out_data = padded_out.mutable_data(); +#ifdef MACE_ENABLE_NEON + const index_t kernel_h = filter->dim(2); + const index_t kernel_w = filter->dim(3); bool use_neon_2x2_s1 = kernel_h == kernel_w && kernel_h == 2 && strides_[0] == strides_[1] && strides_[0] == 1; @@ -166,197 +98,76 @@ class Deconv2dOp : public Deconv2dOpBase { bool use_neon_4x4_s2 = kernel_h == kernel_w && kernel_h == 4 && strides_[0] == strides_[1] && strides_[0] == 2; - if (use_neon_2x2_s1) { - deconv_func = [=](const float *input, - const float *filter, - const index_t *input_shape, - const index_t *padded_output_shape, - float *padded_output) { - Deconv2dNeonK2x2S1(input, - filter, - input_shape, - padded_output_shape, - padded_output); - }; - } else if (use_neon_2x2_s2) { - deconv_func = [=](const float *input, - const float *filter, - const index_t *input_shape, - const index_t *padded_output_shape, - float *padded_output) { - Deconv2dNeonK2x2S2(input, - filter, - input_shape, - padded_output_shape, - padded_output); - }; - } else if (use_neon_3x3_s1) { - deconv_func = [=](const float *input, - const float *filter, - const index_t *input_shape, - const index_t *padded_output_shape, - float *padded_output) { - Deconv2dNeonK3x3S1(input, - filter, - input_shape, - padded_output_shape, - padded_output); - }; - } else if (use_neon_3x3_s2) { - deconv_func = [=](const float *input, - const float *filter, - const index_t *input_shape, - const index_t *padded_output_shape, - float *padded_output) { - Deconv2dNeonK3x3S2(input, - filter, - input_shape, - padded_output_shape, - padded_output); - }; - } else if (use_neon_4x4_s1) { - deconv_func = [=](const float *input, - const float *filter, - const index_t *input_shape, - const index_t *padded_output_shape, - float *padded_output) { - Deconv2dNeonK4x4S1(input, - filter, - input_shape, - padded_output_shape, - padded_output); - }; - } else if (use_neon_4x4_s2) { - deconv_func = [=](const float *input, - const float *filter, - const index_t *input_shape, - const index_t *padded_output_shape, - float *padded_output) { - Deconv2dNeonK4x4S2(input, - filter, - input_shape, - padded_output_shape, - padded_output); - }; - } else { - deconv_func = [=](const float *input, - const float *filter, - const index_t *input_shape, - const index_t *padded_output_shape, - float *padded_output) { - Deconv2dGeneral(input, - filter, - kernel_h, - kernel_w, - strides_.data(), - input_shape, - padded_output_shape, - padded_output); - }; + if (deconv2d_delegator_ == nullptr) { + if (use_neon_2x2_s1) { + deconv2d_delegator_ = make_unique( + paddings_, padding_type_, model_type_); + } else if (use_neon_2x2_s2) { + deconv2d_delegator_ = make_unique( + paddings_, padding_type_, model_type_); + } else if (use_neon_3x3_s1) { + deconv2d_delegator_ = make_unique( + paddings_, padding_type_, model_type_); + } else if (use_neon_3x3_s2) { + deconv2d_delegator_ = make_unique( + paddings_, padding_type_, model_type_); + } else if (use_neon_4x4_s1) { + 
deconv2d_delegator_ = make_unique( + paddings_, padding_type_, model_type_); + } else if (use_neon_4x4_s2) { + deconv2d_delegator_ = make_unique( + paddings_, padding_type_, model_type_); + } else { + deconv2d_delegator_ = + make_unique(strides_, + std::vector{1, 1}, + paddings_, + padding_type_, + model_type_); + } } - - bool no_pad = - (padded_out_shape[2] == out_shape[2]) && - (padded_out_shape[3] == out_shape[3]); - float *out_data = no_pad ? output_data : padded_out_data; - - deconv_func(input_data, - filter_data, - in_shape, - padded_out_shape.data(), - out_data); - if (!no_pad) { - CropPadOut(out_data, - padded_out_shape.data(), - out_shape.data(), - pad_h, - pad_w, - output_data); + deconv2d_delegator_->Compute(context, + input, + filter, + output_shape_tensor, + output); +#else + if (deconv2d_delegator_ == nullptr) { + deconv2d_delegator_ = make_unique>(strides_, + std::vector{ + 1, 1}, + paddings_, + padding_type_, + model_type_); } + deconv2d_delegator_->Compute(context, + input, + filter, + output_shape_tensor, + output); - if (bias_data != nullptr) { - const index_t batch = out_shape[0]; - const index_t channels = out_shape[1]; - const index_t img_size = out_shape[2] * out_shape[3]; -#pragma omp parallel for collapse(3) schedule(runtime) - for (index_t b = 0; b < batch; ++b) { - for (index_t c = 0; c < channels; ++c) { - for (index_t i = 0; i < img_size; ++i) { - output_data[(b * channels + c) * img_size + i] += - bias_data[c]; - } - } - } - } +#endif // MACE_ENABLE_NEON - DoActivation(output_data, - output_data, - output->size(), - activation_, - relux_max_limit_, - leakyrelu_coefficient_); + bias_add_delegator_.Compute(context, output, bias, output); + activation_delegator_.Compute(context, output, output); return MaceStatus::MACE_SUCCESS; } private: - void Deconv2dGeneral(const float *input, - const float *filter, - const index_t kernel_h, - const index_t kernel_w, - const int *strides, - const index_t *in_shape, - const index_t *out_shape, - float *output) { - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - const index_t in_height = in_shape[2]; - const index_t in_width = in_shape[3]; - const index_t out_img_size = out_height * out_width; - const index_t in_img_size = in_height * in_width; - - const int kernel_size = static_cast(kernel_h * kernel_w); - std::vector index_map(kernel_size, 0); - for (index_t i = 0; i < kernel_h; ++i) { - for (index_t j = 0; j < kernel_w; ++j) { - index_map[i * kernel_w + j] = i * out_width + j; - } - } - - const index_t batch = in_shape[0]; - const index_t out_channels = out_shape[1]; - const index_t in_channels = in_shape[1]; - -#pragma omp parallel for collapse(2) schedule(runtime) - for (int b = 0; b < batch; ++b) { - for (int oc = 0; oc < out_channels; ++oc) { - float *out_base = - output + (b * out_channels + oc) * out_img_size; - for (int i = 0; i < in_height; ++i) { - for (int j = 0; j < in_width; ++j) { - const index_t out_offset = - i * strides[0] * out_width + j * strides[1]; - for (int ic = 0; ic < in_channels; ++ic) { - const index_t input_idx = - (b * in_channels + ic) * in_img_size + i * in_width + j; - const float val = input[input_idx]; - const index_t kernel_offset = - (oc * in_channels + ic) * kernel_size; - for (int k = 0; k < kernel_size; ++k) { - const index_t out_idx = out_offset + index_map[k]; - const index_t kernel_idx = kernel_offset + k; - out_base[out_idx] += val * filter[kernel_idx]; - } - } - } - } - } - } - } +#ifdef MACE_ENABLE_NEON + std::unique_ptr deconv2d_delegator_; + 
arm::fp32::BiasAdd bias_add_delegator_; + arm::fp32::Activation activation_delegator_; +#else + ref::BiasAdd bias_add_delegator_; + ref::Activation activation_delegator_; + std::unique_ptr> deconv2d_delegator_; +#endif // MACE_ENABLE_NEON }; #ifdef MACE_ENABLE_OPENCL -template +template class Deconv2dOp : public Deconv2dOpBase { public: explicit Deconv2dOp(OpConstructContext *context) @@ -394,7 +205,7 @@ class Deconv2dOp : public Deconv2dOpBase { const Tensor *filter = this->Input(1); const Tensor *bias = nullptr; const Tensor *output_shape_tensor = nullptr; - if (model_type_ == ops::CAFFE) { + if (model_type_ == CAFFE) { bias = this->InputSize() >= 3 ? this->Input(2) : nullptr; } else { output_shape_tensor = @@ -407,41 +218,30 @@ class Deconv2dOp : public Deconv2dOpBase { MACE_CHECK_NOTNULL(filter); MACE_CHECK_NOTNULL(output); - std::vector in_paddings(2, 0); - std::vector out_shape(4, 0); - - if (model_type_ == FrameworkType::TENSORFLOW) { - MACE_CHECK_NOTNULL(output_shape_tensor); - MACE_CHECK(output_shape_tensor->size() == 4); - Tensor::MappingGuard output_shape_mapper(output_shape_tensor); - auto output_shape_data = - output_shape_tensor->data(); + std::vector out_shape; + if (output_shape_tensor) { + Tensor::MappingGuard out_shape_guard(output_shape_tensor); + MACE_CHECK(output_shape_tensor->size() == 4, + "output shape should be 4-dims"); out_shape = - std::vector(output_shape_data, output_shape_data + 4); - - CalcDeconvShape_TF( - input->shape().data(), - filter->shape().data(), - out_shape.data(), - strides_.data(), - 1, - padding_type_, - in_paddings.data(), - nullptr, - nullptr); - } else { - std::vector out_paddings(2, 0); - if (!paddings_.empty()) out_paddings = paddings_; - CalcDeconvShape_Caffe( - input->shape().data(), - filter->shape().data(), - strides_.data(), - out_paddings.data(), - 1, - in_paddings.data(), - out_shape.data(), - nullptr); + std::vector(output_shape_tensor->data(), + output_shape_tensor->data() + 4); } + std::vector in_paddings; + std::vector out_paddings; + + CalDeconvOutputShapeAndPadSize(input->shape(), + filter->shape(), + strides_, + padding_type_, + paddings_, + 1, + &out_shape, + &in_paddings, + &out_paddings, + nullptr, + model_type_, + NHWC); return kernel_->Compute(context, input, filter, bias, strides_.data(), in_paddings.data(), activation_, @@ -454,7 +254,6 @@ class Deconv2dOp : public Deconv2dOpBase { }; #endif // MACE_ENABLE_OPENCL - void RegisterDeconv2D(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "Deconv2D", Deconv2dOp, DeviceType::CPU, float); diff --git a/mace/ops/deconv_2d.h b/mace/ops/deconv_2d.h index 008c6a5b5ea2cb9cc14c7c40940206e81c4f7aed..50a2ecee5e8329ea24aa3fbae419823831d1b370 100644 --- a/mace/ops/deconv_2d.h +++ b/mace/ops/deconv_2d.h @@ -27,11 +27,6 @@ namespace mace { namespace ops { -enum FrameworkType { - TENSORFLOW = 0, - CAFFE = 1, -}; - class Deconv2dOpBase : public Operation { public: explicit Deconv2dOpBase(OpConstructContext *context) @@ -41,7 +36,7 @@ class Deconv2dOpBase : public Operation { "padding", static_cast(SAME)))), paddings_(Operation::GetRepeatedArgs("padding_values")), group_(Operation::GetOptionalArg("group", 1)), - model_type_(static_cast( + model_type_(static_cast( Operation::GetOptionalArg("framework_type", 0))), activation_(ops::StringToActivationType( Operation::GetOptionalArg("activation", @@ -51,140 +46,6 @@ class Deconv2dOpBase : public Operation { leakyrelu_coefficient_( Operation::GetOptionalArg("leakyrelu_coefficient", 0.0f)) {} - static void CalcDeconvShape_Caffe( - const 
index_t *input_shape, // NHWC - const index_t *filter_shape, // OIHW - const int *strides, - const int *out_paddings, - const int group, - int *in_paddings, - index_t *out_shape, - index_t *padded_out_shape, - const bool isNCHW = false) { - MACE_CHECK_NOTNULL(out_paddings); - MACE_CHECK_NOTNULL(input_shape); - MACE_CHECK_NOTNULL(filter_shape); - MACE_CHECK_NOTNULL(strides); - - const index_t in_height = isNCHW ? input_shape[2] : input_shape[1]; - const index_t in_width = isNCHW ? input_shape[3] : input_shape[2]; - - const index_t output_channel = filter_shape[0] * group; - - const index_t kernel_h = filter_shape[2]; - const index_t kernel_w = filter_shape[3]; - - index_t padded_out_height = - (in_height - 1) * strides[0] + kernel_h; - index_t padded_out_width = - (in_width - 1) * strides[1] + kernel_w; - - if (in_paddings != nullptr) { - in_paddings[0] = static_cast((kernel_h - 1) * 2 - out_paddings[0]); - in_paddings[1] = static_cast((kernel_w - 1) * 2 - out_paddings[1]); - in_paddings[0] = std::max(0, in_paddings[0]); - in_paddings[1] = std::max(0, in_paddings[1]); - } - - if (padded_out_shape != nullptr) { - padded_out_shape[0] = input_shape[0]; - padded_out_shape[1] = isNCHW ? output_channel : padded_out_height; - padded_out_shape[2] = isNCHW ? padded_out_height : padded_out_width; - padded_out_shape[3] = isNCHW ? padded_out_width : output_channel; - } - - if (out_shape != nullptr) { - index_t out_height = padded_out_height - out_paddings[0]; - index_t out_width = padded_out_width - out_paddings[1]; - out_shape[0] = input_shape[0]; - out_shape[1] = isNCHW ? output_channel : out_height; - out_shape[2] = isNCHW ? out_height : out_width; - out_shape[3] = isNCHW ? out_width : output_channel; - } - } - - static void CalcDeconvShape_TF( - const index_t *input_shape, // NHWC - const index_t *filter_shape, // OIHW - const index_t *output_shape, - const int *strides, - const int group, - Padding padding_type, - int *in_paddings, - int *out_paddings, - index_t *padded_out_shape, - const bool isNCHW = false) { - MACE_CHECK_NOTNULL(output_shape); - MACE_CHECK_NOTNULL(input_shape); - MACE_CHECK_NOTNULL(filter_shape); - MACE_CHECK_NOTNULL(strides); - - const index_t in_height = isNCHW ? input_shape[2] : input_shape[1]; - const index_t in_width = isNCHW ? input_shape[3] : input_shape[2]; - - const index_t out_height = isNCHW ? output_shape[2] : output_shape[1]; - const index_t out_width = isNCHW ? 
output_shape[3] : output_shape[2]; - - const index_t extended_in_height = (in_height - 1) * strides[0] + 1; - const index_t extended_in_width = (in_width - 1) * strides[1] + 1; - - const index_t kernel_h = filter_shape[2]; - const index_t kernel_w = filter_shape[3]; - - index_t expected_input_height = 0, expected_input_width = 0; - - switch (padding_type) { - case VALID: - expected_input_height = - (out_height - kernel_h + strides[0]) / strides[0]; - expected_input_width = - (out_width - kernel_w + strides[1]) / strides[1]; - break; - case SAME: - expected_input_height = - (out_height + strides[0] - 1) / strides[0]; - expected_input_width = - (out_width + strides[1] - 1) / strides[1]; - break; - default: - MACE_CHECK(false, "Unsupported padding type: ", padding_type); - } - - MACE_CHECK(expected_input_height == in_height, - expected_input_height, "!=", in_height); - MACE_CHECK(expected_input_width == in_width, - expected_input_width, "!=", in_width); - - const index_t padded_out_height = - (in_height - 1) * strides[0] + kernel_h; - const index_t padded_out_width = - (in_width - 1) * strides[1] + kernel_w; - - if (in_paddings != nullptr) { - const int p_h = - static_cast(out_height + kernel_h - 1 - extended_in_height); - const int p_w = - static_cast(out_width + kernel_w - 1 - extended_in_width); - in_paddings[0] = std::max(0, p_h); - in_paddings[1] = std::max(0, p_w); - } - - if (out_paddings != nullptr) { - const int o_p_h = static_cast(padded_out_height - out_height); - const int o_p_w = static_cast(padded_out_width - out_width); - out_paddings[0] = std::max(0, o_p_h); - out_paddings[1] = std::max(0, o_p_w); - } - - if (padded_out_shape != nullptr) { - index_t output_channel = filter_shape[0] * group; - padded_out_shape[0] = output_shape[0]; - padded_out_shape[1] = isNCHW ? output_channel : padded_out_height; - padded_out_shape[2] = isNCHW ? padded_out_height : padded_out_width; - padded_out_shape[3] = isNCHW ? 
padded_out_width : output_channel; - } - } - protected: std::vector strides_; // [stride_h, stride_w] const Padding padding_type_; @@ -196,34 +57,6 @@ class Deconv2dOpBase : public Operation { const float leakyrelu_coefficient_; }; -template -void CropPadOut(const T *input, - const index_t *in_shape, - const index_t *out_shape, - const index_t pad_h, - const index_t pad_w, - T *output) { - const index_t batch = in_shape[0]; - const index_t channel = in_shape[1]; - const index_t in_height = in_shape[2]; - const index_t in_width = in_shape[3]; - - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; -#pragma omp parallel for collapse(3) - for (int i = 0; i < batch; ++i) { - for (int j = 0; j < channel; ++j) { - for (int k = 0; k < out_height; ++k) { - const T *input_base = - input + ((i * channel + j) * in_height + (k + pad_h)) * in_width; - T *output_base = - output + ((i * channel + j) * out_height + k)* out_width; - memcpy(output_base, input_base + pad_w, out_width * sizeof(T)); - } - } - } -} - } // namespace ops } // namespace mace diff --git a/mace/ops/deconv_2d_test.cc b/mace/ops/deconv_2d_test.cc index d8a1c621a49656a845319e1c849b9037e618fec4..25aa7eeeeed80e6403c125ec101a95c536eebe2c 100644 --- a/mace/ops/deconv_2d_test.cc +++ b/mace/ops/deconv_2d_test.cc @@ -38,7 +38,7 @@ void RunTestSimple(const std::vector &input_shape, const std::vector &filter_data, const std::vector &expected_shape, const std::vector &expected_data, - ops::FrameworkType model_type) { + FrameworkType model_type) { OpsTestNet net; // Add input data const index_t out_channels = filter_shape[2]; @@ -49,7 +49,7 @@ void RunTestSimple(const std::vector &input_shape, // TODO(liutuo): remove the unused transform net.TransformFilterDataFormat("Filter", HWOI, "FilterOIHW", OIHW); if (D == DeviceType::GPU) { - if (model_type == ops::FrameworkType::CAFFE) { + if (model_type == FrameworkType::CAFFE) { OpDefBuilder("Deconv2D", "Deconv2dTest") .Input("Input") .Input("FilterOIHW") @@ -80,7 +80,7 @@ void RunTestSimple(const std::vector &input_shape, net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); - if (model_type == ops::FrameworkType::CAFFE) { + if (model_type == FrameworkType::CAFFE) { OpDefBuilder("Deconv2D", "Deconv2dTest") .Input("InputNCHW") .Input("FilterOIHW") @@ -128,7 +128,7 @@ void TestNHWCSimple3x3SAME_S1() { {4.5, 4.6, 4.7, 6.5, 6.6, 6.7, 4.5, 4.6, 4.7, 6.5, 6.6, 6.7, 9.5, 9.6, 9.7, 6.5, 6.6, 6.7, 4.5, 4.6, 4.7, 6.5, 6.6, 6.7, 4.5, 4.6, 4.7}, - ops::FrameworkType::TENSORFLOW); + FrameworkType::TENSORFLOW); RunTestSimple({1, 3, 3, 1}, {1, 1, 1, 1, 1, 1, 1, 1, 1}, {0, 0, 0}, 1, Padding::VALID, {2, 2}, {0}, {3, 3, 3, 1}, @@ -137,7 +137,7 @@ void TestNHWCSimple3x3SAME_S1() { {1, 3, 3, 3}, {4, 4, 4, 6, 6, 6, 4, 4, 4, 6, 6, 6, 9, 9, 9, 6, 6, 6, 4, 4, 4, 6, 6, 6, 4, 4, 4}, - ops::FrameworkType::CAFFE); + FrameworkType::CAFFE); RunTestSimple({1, 3, 3, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}, {0, 0, 0}, 1, Padding::SAME, {}, {1, 3, 3, 3}, {3, 3, 3, 1}, @@ -147,7 +147,7 @@ void TestNHWCSimple3x3SAME_S1() { {54, 66, 78, 126, 147, 168, 130, 146, 162, 198, 225, 252, 405, 450, 495, 366, 399, 432, 354, 378, 402, 630, 669, 708, 502, 530, 558}, - ops::FrameworkType::TENSORFLOW); + FrameworkType::TENSORFLOW); RunTestSimple({1, 3, 3, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}, {0, 0, 0}, 1, Padding::SAME, {2, 2}, {0}, {3, 3, 3, 1}, @@ -157,7 +157,7 @@ void TestNHWCSimple3x3SAME_S1() { {54, 66, 78, 126, 147, 168, 130, 146, 162, 198, 225, 252, 405, 450, 495, 366, 399, 432, 354, 378, 402, 630, 669, 708, 502, 530, 
558}, - ops::FrameworkType::CAFFE); + FrameworkType::CAFFE); } template @@ -175,7 +175,7 @@ void TestNHWCSimple3x3SAME_S2() { 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 4, 4, 4, 2, 2, 2, 4, 4, 4, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1}, - ops::FrameworkType::TENSORFLOW); + FrameworkType::TENSORFLOW); RunTestSimple({1, 3, 3, 1}, {1, 1, 1, 1, 1, 1, 1, 1, 1}, {0, 0, 0}, 2, Padding::SAME, {2, 2}, {0}, {3, 3, 3, 1}, @@ -188,7 +188,7 @@ void TestNHWCSimple3x3SAME_S2() { 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 4, 4, 4, 2, 2, 2, 4, 4, 4, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1}, - ops::FrameworkType::CAFFE); + FrameworkType::CAFFE); RunTestSimple({1, 3, 3, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}, {0, 0, 0}, 2, Padding::SAME, {}, {1, 6, 6, 3}, {3, 3, 3, 1}, @@ -206,7 +206,7 @@ void TestNHWCSimple3x3SAME_S2() { 83, 94, 105, 116, 127, 138, 252, 276, 300, 142, 155, 168, 304, 332, 360, 168, 183, 198, 70, 77, 84, 91, 98, 105, 192, 207, 222, 104, 112, 120, 218, 235, 252, 117, 126, 135}, - ops::FrameworkType::TENSORFLOW); + FrameworkType::TENSORFLOW); RunTestSimple({1, 3, 3, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}, {0, 0, 0}, 2, Padding::SAME, {2, 2}, {0}, {3, 3, 3, 1}, @@ -219,7 +219,7 @@ void TestNHWCSimple3x3SAME_S2() { 140, 151, 162, 78, 84, 90, 116, 127, 138, 252, 276, 300, 142, 155, 168, 304, 332, 360, 168, 183, 198, 91, 98, 105, 192, 207, 222, 104, 112, 120, 218, 235, 252, 117, 126, 135}, - ops::FrameworkType::CAFFE); + FrameworkType::CAFFE); } template @@ -236,7 +236,7 @@ void TestNHWCSimple3x3SAME_S2_1() { 18, 18, 18, 45, 45, 45, 27, 27, 27, 45, 45, 45, 18, 18, 18, 30, 30, 30, 75, 75, 75, 45, 45, 45, 75, 75, 75, 30, 30, 30, 12, 12, 12, 30, 30, 30, 18, 18, 18, 30, 30, 30, 12, 12, 12}, - ops::FrameworkType::TENSORFLOW); + FrameworkType::TENSORFLOW); } template @@ -261,7 +261,7 @@ void TestNHWCSimple3x3VALID_S2() { 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1}, - ops::FrameworkType::TENSORFLOW); + FrameworkType::TENSORFLOW); } template @@ -278,7 +278,7 @@ void TestNHWCSimple3x3VALID_S1() { 366, 399, 432, 234, 252, 270, 146, 157, 168, 354, 378, 402, 630, 669, 708, 502, 530, 558, 294, 309, 324, 133, 140, 147, 306, 321, 336, 522, 546, 570, 398, 415, 432, 225, 234, 243}, - ops::FrameworkType::TENSORFLOW); + FrameworkType::TENSORFLOW); } template @@ -287,7 +287,7 @@ void TestNHWCSimple2x2SAME() { {1, 2, 2, 1}, {3, 3, 1, 1}, {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}, {1, 2, 2, 1}, {4.f, 4.f, 4.f, 4.f}, - ops::FrameworkType::TENSORFLOW); + FrameworkType::TENSORFLOW); } template @@ -298,7 +298,7 @@ void TestNHWCSimple2x2VALID() { {1, 5, 5, 1}, {1.f, 1.f, 2.f, 1.f, 1.f, 1.f, 1.f, 2.f, 1.f, 1.f, 2.f, 2.f, 4.f, 2.f, 2.f, 1.f, 1.f, 2.f, 1.f, 1.f, 1.f, 1.f, 2.f, 1.f, 1.f}, - ops::FrameworkType::TENSORFLOW); + FrameworkType::TENSORFLOW); } } // namespace @@ -388,11 +388,11 @@ void TestComplexDeconvNxN(const int batch, std::vector paddings; std::vector output_shape; - ops::FrameworkType model_type = + FrameworkType model_type = padding < 0 ? 
- ops::FrameworkType::TENSORFLOW : ops::FrameworkType::CAFFE; + FrameworkType::TENSORFLOW : FrameworkType::CAFFE; - if (model_type == ops::FrameworkType::TENSORFLOW) { + if (model_type == FrameworkType::TENSORFLOW) { if (type == Padding::SAME) { out_h = (height - 1) * stride_h + 1; out_w = (width - 1) * stride_w + 1; @@ -410,7 +410,7 @@ void TestComplexDeconvNxN(const int batch, paddings.push_back(padding); } - if (model_type == ops::FrameworkType::CAFFE) { + if (model_type == FrameworkType::CAFFE) { OpDefBuilder("Deconv2D", "Deconv2dTest") .Input("InputNCHW") .Input("Filter") @@ -448,7 +448,7 @@ void TestComplexDeconvNxN(const int batch, expected->Copy(*net.GetOutput("Output")); // run on gpu - if (model_type == ops::FrameworkType::CAFFE) { + if (model_type == FrameworkType::CAFFE) { OpDefBuilder("Deconv2D", "Deconv2dTest") .Input("Input") .Input("Filter") diff --git a/mace/ops/depth_to_space.cc b/mace/ops/depth_to_space.cc index 2460d75a258068c4e0f08576311bf93ace6b3289..09208e7abf1194455450cb038343b0e79c65891f 100644 --- a/mace/ops/depth_to_space.cc +++ b/mace/ops/depth_to_space.cc @@ -58,7 +58,6 @@ class DepthToSpaceOp : public Operation { const T *input_ptr = input->data(); T *output_ptr = output->mutable_data(); -#pragma omp parallel for schedule(runtime) for (index_t b = 0; b < batch_size; ++b) { for (index_t d = 0; d < output_depth; ++d) { for (index_t h = 0; h < output_height; ++h) { diff --git a/mace/ops/depthwise_conv2d.cc b/mace/ops/depthwise_conv2d.cc index 24b5d4192265a35397c54cc58e009e870943ad64..522a3b357ed24f5804a9bb2d4af41f8605e644a2 100644 --- a/mace/ops/depthwise_conv2d.cc +++ b/mace/ops/depthwise_conv2d.cc @@ -21,6 +21,11 @@ #if defined(MACE_ENABLE_NEON) #include "mace/ops/arm/fp32/depthwise_conv_2d_3x3.h" +#include "mace/ops/arm/fp32/bias_add.h" +#include "mace/ops/arm/fp32/activation.h" +#else +#include "mace/ops/ref/activation.h" +#include "mace/ops/ref/bias_add.h" #endif // MACE_ENABLE_NEON #ifdef MACE_ENABLE_QUANTIZE @@ -36,7 +41,7 @@ #include "mace/ops/conv_pool_2d_base.h" #include "mace/public/mace.h" #include "mace/utils/memory.h" -#include "mace/utils/quantize.h" +#include "mace/core/quantize.h" #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/buffer_transformer.h" #include "mace/ops/opencl/buffer/depthwise_conv2d.h" @@ -69,7 +74,10 @@ template<> class DepthwiseConv2dOp : public DepthwiseConv2dOpBase { public: explicit DepthwiseConv2dOp(OpConstructContext *context) - : DepthwiseConv2dOpBase(context) {} + : DepthwiseConv2dOpBase(context), + activation_delegator_(activation_, + relux_max_limit_, + leakyrelu_coefficient_) {} MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); @@ -129,30 +137,8 @@ class DepthwiseConv2dOp : public DepthwiseConv2dOpBase { ref_conv2d_delegator_->Compute(context, input, filter, output); #endif // MACE_ENABLE_NEON - Tensor::MappingGuard bias_guard(bias); - Tensor::MappingGuard output_guard(output); - auto bias_data = bias == nullptr ? 
nullptr : bias->data(); - auto output_data = output->mutable_data(); - - const index_t batch = output->dim(0); - const index_t channels = output->dim(1); - const index_t height = output->dim(2); - const index_t width = output->dim(3); - - if (bias_data != nullptr) { -#pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch; ++b) { - for (index_t c = 0; c < channels; ++c) { - for (index_t i = 0; i < height * width; ++i) { - output_data[(b * channels + c) * height * width + i] += - bias_data[c]; - } - } - } - } - - DoActivation(output_data, output_data, output->size(), activation_, - relux_max_limit_, leakyrelu_coefficient_); + bias_add_delegator_.Compute(context, output, bias, output); + activation_delegator_.Compute(context, output, output); return MaceStatus::MACE_SUCCESS; } @@ -160,6 +146,11 @@ class DepthwiseConv2dOp : public DepthwiseConv2dOpBase { private: #ifdef MACE_ENABLE_NEON std::unique_ptr conv2d_delegator_; + arm::fp32::BiasAdd bias_add_delegator_; + arm::fp32::Activation activation_delegator_; +#else + ref::BiasAdd bias_add_delegator_; + ref::Activation activation_delegator_; #endif // MACE_ENABLE_NEON std::unique_ptr> ref_conv2d_delegator_; @@ -169,7 +160,7 @@ class DepthwiseConv2dOp : public DepthwiseConv2dOpBase { }; #ifdef MACE_ENABLE_QUANTIZE -template <> +template<> class DepthwiseConv2dOp : public DepthwiseConv2dOpBase { public: @@ -269,7 +260,7 @@ class DepthwiseConv2dOp float output_multiplier = input->scale() * filter->scale() / output->scale(); const int pad_hw[2] = {pad_top, pad_left}; - DepthwiseConv2dGeneral( + DepthwiseConv2dGeneral(context, input_data, filter_data, bias_data, input->shape().data(), output_shape.data(), filter->shape().data(), input->zero_point(), filter->zero_point(), output->zero_point(), output_multiplier, @@ -279,7 +270,8 @@ class DepthwiseConv2dOp return MaceStatus::MACE_SUCCESS; } private: - void DepthwiseConv2dGeneral(const uint8_t *input, + void DepthwiseConv2dGeneral(const OpContext *context, + const uint8_t *input, const uint8_t *filter, const int32_t *bias, const index_t *in_shape, @@ -293,54 +285,60 @@ class DepthwiseConv2dOp const int *dilation_hw, const int *pad_hw, uint8_t *output) { -#pragma omp parallel for collapse(2) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t h = 0; h < out_shape[1]; ++h) { - for (index_t w = 0; w < out_shape[2]; ++w) { - for (index_t m = 0; m < out_shape[3]; ++m) { - const index_t filter_height = filter_shape[0]; - const index_t filter_width = filter_shape[1]; - const index_t in_channels = filter_shape[2]; - const index_t depth_multiplier = filter_shape[3]; - const index_t in_height = in_shape[1]; - const index_t in_width = in_shape[2]; - const index_t out_height = out_shape[1]; - const index_t out_width = out_shape[2]; - const index_t out_channels = out_shape[3]; - index_t out_offset = - ((b * out_height + h) * out_width + w) * out_channels + m; - index_t c = m / depth_multiplier; - index_t o = m % depth_multiplier; - index_t ih_base = h * stride_hw[0] - pad_hw[0]; - index_t iw_base = w * stride_hw[1] - pad_hw[1]; - int32_t sum = 0; - for (index_t kh = 0; kh < filter_height; ++kh) { - const index_t ih = ih_base + kh * dilation_hw[0]; - for (index_t kw = 0; kw < filter_width; ++kw) { - const index_t iw = iw_base + kw * dilation_hw[1]; - if (ih >= 0 && ih < in_height && iw >= 0 && iw < in_width) { - index_t in_offset = - ((b * in_height + ih) * in_width + iw) * in_channels + c; - index_t filter_offset = - ((kh * filter_width + kw) * in_channels + c) - * depth_multiplier + o; - - 
sum += (input[in_offset] - input_zero) * - (filter[filter_offset] - filter_zero); + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t h = start1; h < end1; h += step1) { + for (index_t w = 0; w < out_shape[2]; ++w) { + for (index_t m = 0; m < out_shape[3]; ++m) { + const index_t filter_height = filter_shape[0]; + const index_t filter_width = filter_shape[1]; + const index_t in_channels = filter_shape[2]; + const index_t depth_multiplier = filter_shape[3]; + const index_t in_height = in_shape[1]; + const index_t in_width = in_shape[2]; + const index_t out_height = out_shape[1]; + const index_t out_width = out_shape[2]; + const index_t out_channels = out_shape[3]; + index_t out_offset = + ((b * out_height + h) * out_width + w) * out_channels + m; + index_t c = m / depth_multiplier; + index_t o = m % depth_multiplier; + index_t ih_base = h * stride_hw[0] - pad_hw[0]; + index_t iw_base = w * stride_hw[1] - pad_hw[1]; + int32_t sum = 0; + for (index_t kh = 0; kh < filter_height; ++kh) { + const index_t ih = ih_base + kh * dilation_hw[0]; + for (index_t kw = 0; kw < filter_width; ++kw) { + const index_t iw = iw_base + kw * dilation_hw[1]; + if (ih >= 0 && ih < in_height && iw >= 0 && iw < in_width) { + index_t in_offset = + ((b * in_height + ih) * in_width + iw) * in_channels + + c; + index_t filter_offset = + ((kh * filter_width + kw) * in_channels + c) + * depth_multiplier + o; + + sum += (input[in_offset] - input_zero) * + (filter[filter_offset] - filter_zero); + } } } + if (bias) { + sum += bias[m]; + } + sum = static_cast(std::round(sum * output_multiplier)); + sum += output_zero; + output[out_offset] = + static_cast(std::min(255, std::max(0, sum))); } - if (bias) { - sum += bias[m]; - } - sum = static_cast(std::round(sum * output_multiplier)); - sum += output_zero; - output[out_offset] = - static_cast(std::min(255, std::max(0, sum))); } } } - } + }, 0, out_shape[0], 1, 0, out_shape[1], 1); } inline tflite::Dims<4> ShapeToTfliteDims(const std::vector &shape) { diff --git a/mace/ops/depthwise_conv2d_test.cc b/mace/ops/depthwise_conv2d_test.cc index 72a50f24ce868da3ab5344062e3fa5ebeefbda2f..58852a012e84fb6664331708738adcd180519e5d 100644 --- a/mace/ops/depthwise_conv2d_test.cc +++ b/mace/ops/depthwise_conv2d_test.cc @@ -440,7 +440,8 @@ void TestQuant(const index_t batch, auto bias_data = bias->data(); float bias_scale = q_input->scale() * q_filter->scale(); std::vector q_bias(bias->size()); - QuantizeWithScaleAndZeropoint( + QuantizeUtil quantize_util(OpTestContext::Get()->thread_pool()); + quantize_util.QuantizeWithScaleAndZeropoint( bias_data, bias->size(), bias_scale, 0, q_bias.data()); net.AddInputFromArray( "QuantizedBias", {out_channels}, q_bias, true, bias_scale, 0); diff --git a/mace/ops/depthwise_deconv2d.cc b/mace/ops/depthwise_deconv2d.cc index 3d203cfa5678c1ca407b6db2d441890bc00785a5..6111ea3062b241514fccca9167410f6314e4fcaf 100644 --- a/mace/ops/depthwise_deconv2d.cc +++ b/mace/ops/depthwise_deconv2d.cc @@ -16,6 +16,16 @@ #if defined(MACE_ENABLE_NEON) #include +#include "mace/ops/arm/fp32/depthwise_deconv_2d_general.h" +#include "mace/ops/arm/fp32/depthwise_deconv_2d_3x3.h" +#include "mace/ops/arm/fp32/depthwise_deconv_2d_4x4.h" +#include "mace/ops/arm/fp32/bias_add.h" +#include "mace/ops/arm/fp32/activation.h" + +#else +#include 
"mace/ops/ref/depthwise_deconv_2d.h" +#include "mace/ops/ref/bias_add.h" +#include "mace/ops/ref/activation.h" #endif #include @@ -25,10 +35,11 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" -#include "mace/ops/arm/depthwise_deconv2d_neon.h" #include "mace/utils/math.h" #include "mace/public/mace.h" #include "mace/utils/memory.h" +#include "mace/ops/common/conv_pool_2d_util.h" + #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/buffer_transformer.h" #include "mace/ops/opencl/image/depthwise_deconv2d.h" @@ -45,7 +56,10 @@ class DepthwiseDeconv2dOp : public Deconv2dOpBase { public: explicit DepthwiseDeconv2dOp(OpConstructContext *context) - : Deconv2dOpBase(context) {} + : Deconv2dOpBase(context), + activation_delegator_(activation_, + relux_max_limit_, + leakyrelu_coefficient_) {} MaceStatus Run(OpContext *context) override { const Tensor *input = this->Input(0); @@ -57,60 +71,12 @@ class DepthwiseDeconv2dOp MACE_CHECK_NOTNULL(filter); MACE_CHECK_NOTNULL(output); - std::vector out_paddings(2, 0); - std::vector out_shape(4, 0); - std::vector padded_out_shape(4, 0); - - if (!paddings_.empty()) out_paddings = paddings_; - CalcDeconvShape_Caffe( - input->shape().data(), - filter->shape().data(), - strides_.data(), - out_paddings.data(), - group_, - nullptr, - out_shape.data(), - padded_out_shape.data(), - true); - - MACE_RETURN_IF_ERROR(output->Resize(out_shape)); - output->Clear(); - index_t kernel_h = filter->dim(2); - index_t kernel_w = filter->dim(3); - - Tensor::MappingGuard input_mapper(input); - Tensor::MappingGuard filter_mapper(filter); - Tensor::MappingGuard bias_mapper(bias); - Tensor::MappingGuard output_mapper(output); - auto input_data = input->data(); - auto filter_data = filter->data(); - auto bias_data = bias == nullptr ? nullptr : bias->data(); - - auto output_data = output->mutable_data(); - - const index_t pad_left = out_paddings[0] / 2; - const index_t pad_top = out_paddings[1] / 2; - - index_t padded_out_size = - PadAlignSize(std::accumulate(padded_out_shape.begin(), - padded_out_shape.end(), - 1, - std::multiplies()) - * sizeof(float) + MACE_EXTRA_BUFFER_PAD_SIZE); - ScratchBuffer *scratch = context->device()->scratch_buffer(); - scratch->Rewind(); - scratch->GrowSize(padded_out_size); - Tensor padded_out(scratch->Scratch(padded_out_size), DT_FLOAT); - padded_out.Reshape(padded_out_shape); - padded_out.Clear(); - auto *padded_out_data = padded_out.mutable_data(); - const index_t in_channels = input->dim(1); - const index_t out_channels = output->dim(1); - - bool no_pad = paddings_[0] == 0 && paddings_[1] == 0; - float *out_data = no_pad ? 
output_data : padded_out_data; + bool is_depthwise = group_ == in_channels; +#ifdef MACE_ENABLE_NEON + const index_t kernel_h = filter->dim(2); + const index_t kernel_w = filter->dim(3); bool use_neon_3x3_s1 = kernel_h == kernel_w && kernel_h == 3 && strides_[0] == strides_[1] && strides_[0] == 1; bool use_neon_3x3_s2 = kernel_h == kernel_w && kernel_h == 3 && @@ -120,289 +86,101 @@ class DepthwiseDeconv2dOp bool use_neon_4x4_s2 = kernel_h == kernel_w && kernel_h == 4 && strides_[0] == strides_[1] && strides_[0] == 2; - bool is_depthwise = (group_ == in_channels && group_ == out_channels); - - std::function kernel_func; - - if (use_neon_3x3_s1) { - kernel_func = [=](const float *input, - const float *filter, - const int group, - const index_t *in_shape, - const index_t *padded_out_shape, - float *padded_output) { - if (is_depthwise) { - DepthwiseDeconv2dNeonK3x3S1(input, - filter, - in_shape, - padded_out_shape, - padded_output); - } else { - GroupDeconv2dNeonK3x3S1(input, - filter, - group, - in_shape, - padded_out_shape, - padded_output); - } - }; - } else if (use_neon_3x3_s2) { - kernel_func = [=](const float *input, - const float *filter, - const int group, - const index_t *in_shape, - const index_t *padded_out_shape, - float *padded_output) { - if (is_depthwise) { - DepthwiseDeconv2dNeonK3x3S2(input, - filter, - in_shape, - padded_out_shape, - padded_output); - } else { - GroupDeconv2dNeonK3x3S2(input, - filter, - group, - in_shape, - padded_out_shape, - padded_output); - } - }; - } else if (use_neon_4x4_s1) { - kernel_func = [=](const float *input, - const float *filter, - const int group, - const index_t *in_shape, - const index_t *padded_out_shape, - float *padded_output) { - if (is_depthwise) { - DepthwiseDeconv2dNeonK4x4S1(input, - filter, - in_shape, - padded_out_shape, - padded_output); + if (deconv2d_delegator_ == nullptr) { + if (is_depthwise) { + if (use_neon_3x3_s1) { + deconv2d_delegator_ = make_unique( + paddings_, padding_type_, CAFFE); + } else if (use_neon_3x3_s2) { + deconv2d_delegator_ = make_unique( + paddings_, padding_type_, CAFFE); + } else if (use_neon_4x4_s1) { + deconv2d_delegator_ = make_unique( + paddings_, padding_type_, CAFFE); + } else if (use_neon_4x4_s2) { + deconv2d_delegator_ = make_unique( + paddings_, padding_type_, CAFFE); } else { - GroupDeconv2dNeonK4x4S1(input, - filter, - group, - in_shape, - padded_out_shape, - padded_output); + deconv2d_delegator_ = + make_unique( + strides_, + std::vector{1, 1}, + paddings_, + padding_type_, + CAFFE); } - }; - } else if (use_neon_4x4_s2) { - kernel_func = [=](const float *input, - const float *filter, - const int group, - const index_t *in_shape, - const index_t *padded_out_shape, - float *padded_output) { - if (is_depthwise) { - DepthwiseDeconv2dNeonK4x4S2(input, - filter, - in_shape, - padded_out_shape, - padded_output); - } else { - GroupDeconv2dNeonK4x4S2(input, - filter, - group, - in_shape, - padded_out_shape, - padded_output); - } - }; - } else { - kernel_func = [=](const float *input, - const float *filter, - const int group, - const index_t *in_shape, - const index_t *padded_out_shape, - float *padded_output) { - if (is_depthwise) { - DepthwiseDeconv2dGeneral(input, - filter, - kernel_h, - kernel_w, - strides_.data(), - in_shape, - padded_out_shape, - padded_output); + } else { + if (use_neon_3x3_s1) { + deconv2d_delegator_ = make_unique( + paddings_, padding_type_, group_, CAFFE); + } else if (use_neon_3x3_s2) { + deconv2d_delegator_ = make_unique( + paddings_, padding_type_, group_, CAFFE); + } 
else if (use_neon_4x4_s1) { + deconv2d_delegator_ = make_unique( + paddings_, padding_type_, group_, CAFFE); + } else if (use_neon_4x4_s2) { + deconv2d_delegator_ = make_unique( + paddings_, padding_type_, group_, CAFFE); } else { - GroupDeconv2dGeneral(input, - filter, - kernel_h, - kernel_w, - strides_.data(), - group, - in_shape, - padded_out_shape, - padded_output); + deconv2d_delegator_ = make_unique( + strides_, + std::vector{1, 1}, + paddings_, + padding_type_, + group_, + CAFFE); } - }; - } - - kernel_func(input_data, - filter_data, - group_, - input->shape().data(), - padded_out_shape.data(), - out_data); - - if (!no_pad) { - CropPadOut(out_data, - padded_out_shape.data(), - out_shape.data(), - pad_left, - pad_top, - output_data); + } } - if (bias_data != nullptr) { - const index_t batch = out_shape[0]; - const index_t channels = out_shape[1]; - const index_t img_size = out_shape[2] * out_shape[3]; -#pragma omp parallel for collapse(3) - for (index_t b = 0; b < batch; ++b) { - for (index_t c = 0; c < channels; ++c) { - for (index_t i = 0; i < img_size; ++i) { - output_data[(b * channels + c) * img_size + i] += - bias_data[c]; - } - } + deconv2d_delegator_->Compute(context, + input, + filter, + nullptr, + output); +#else + if (deconv2d_delegator_ == nullptr) { + if (is_depthwise) { + deconv2d_delegator_ = make_unique>( + strides_, + std::vector{1, 1}, + paddings_, + padding_type_, + CAFFE); + } else { + deconv2d_delegator_ = make_unique>( + strides_, + std::vector{1, 1}, + paddings_, + padding_type_, + group_, + CAFFE); } } + deconv2d_delegator_->Compute(context, + input, + filter, + nullptr, + output); +#endif - DoActivation(output_data, - output_data, - output->size(), - activation_, - relux_max_limit_, - leakyrelu_coefficient_); + bias_add_delegator_.Compute(context, output, bias, output); + activation_delegator_.Compute(context, output, output); return MaceStatus::MACE_SUCCESS; } private: - void DepthwiseDeconv2dGeneral(const float *input, - const float *filter, - const index_t kernel_h, - const index_t kernel_w, - const int *strides, - const index_t *in_shape, - const index_t *out_shape, - float *output) { - const index_t batch = in_shape[0]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - - const index_t channels = in_shape[1]; - const index_t in_height = in_shape[2]; - const index_t in_width = in_shape[3]; - - const index_t out_img_size = out_height * out_width; - const index_t in_img_size = in_height * in_width; - - const int kernel_size = kernel_h * kernel_w; - std::vector index_map(kernel_size, 0); - for (int i = 0; i < kernel_h; ++i) { - for (int j = 0; j < kernel_w; ++j) { - index_map[i * kernel_w + j] = i * out_width + j; - } - } - -#pragma omp parallel for collapse(2) - for (int b = 0; b < batch; ++b) { - for (int c = 0; c < channels; ++c) { - float *out_base = - output + (b * channels + c) * out_img_size; - for (int i = 0; i < in_height; ++i) { - for (int j = 0; j < in_width; ++j) { - const index_t out_offset = - i * strides[0] * out_width + j * strides[1]; - const index_t input_idx = - (b * channels + c) * in_img_size + i * in_width + j; - const float val = input[input_idx]; - const index_t kernel_offset = c * kernel_size; - for (int k = 0; k < kernel_size; ++k) { - const index_t out_idx = out_offset + index_map[k]; - const index_t kernel_idx = kernel_offset + k; - out_base[out_idx] += val * filter[kernel_idx]; - } - } - } - } - } - } - - void GroupDeconv2dGeneral(const float *input, - const float *filter, - const index_t 
kernel_h, - const index_t kernel_w, - const int *strides, - const int group, - const index_t *in_shape, - const index_t *out_shape, - float *output) { - const index_t out_channels = out_shape[1]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - - const index_t in_channels = in_shape[1]; - const index_t in_height = in_shape[2]; - const index_t in_width = in_shape[3]; - - MACE_CHECK(in_channels % group == 0 && out_channels % group == 0, - "invalid input/output channel and group."); - - const index_t out_img_size = out_height * out_width; - const index_t in_img_size = in_height * in_width; - - const int kernel_size = kernel_h * kernel_w; - std::vector index_map(kernel_size, 0); - for (int i = 0; i < kernel_h; ++i) { - for (int j = 0; j < kernel_w; ++j) { - index_map[i * kernel_w + j] = i * out_width + j; - } - } - - const int in_channels_g = in_channels / group; - const int out_channels_g = out_channels / group; -#pragma omp parallel for collapse(3) - for (int b = 0; b < in_shape[0]; ++b) { - for (int g = 0; g < group; ++g) { - for (int p = 0; p < out_channels_g; ++p) { - const index_t out_base = - ((b * group + g) * out_channels_g + p) * out_img_size; - for (int i = 0; i < in_height; ++i) { - for (int j = 0; j < in_width; ++j) { - const index_t out_offset = - i * strides[0] * out_width + j * strides[1]; - for (int q = 0; q < in_channels_g; ++q) { - const index_t in_base = - ((b * group + g) * in_channels_g + q) * in_img_size; - const index_t in_offset = - in_base + i * in_width + j; - const float val = input[in_offset]; - const index_t k_offset = - ((p * group + g) * in_channels_g + q) * kernel_size; - for (int k = 0; k < kernel_size; ++k) { - const index_t out_idx = out_base + out_offset + index_map[k]; - const float w = filter[k_offset + k]; - output[out_idx] += val * w; - } - } - } - } - } - } - } - } +#ifdef MACE_ENABLE_NEON + std::unique_ptr deconv2d_delegator_; + arm::fp32::BiasAdd bias_add_delegator_; + arm::fp32::Activation activation_delegator_; +#else + std::unique_ptr> deconv2d_delegator_; + ref::BiasAdd bias_add_delegator_; + ref::Activation activation_delegator_; +#endif // MACE_ENABLE_NEON }; #ifdef MACE_ENABLE_OPENCL @@ -437,19 +215,22 @@ class DepthwiseDeconv2dOp : public Deconv2dOpBase { MACE_CHECK_NOTNULL(filter); MACE_CHECK_NOTNULL(output); - std::vector in_paddings(2, 0); - std::vector out_paddings(2, 0); - std::vector out_shape(4, 0); - - if (!paddings_.empty()) out_paddings = paddings_; - CalcDeconvShape_Caffe(input->shape().data(), - filter->shape().data(), - strides_.data(), - out_paddings.data(), - group_, - in_paddings.data(), - out_shape.data(), - nullptr); + std::vector out_shape; + std::vector in_paddings; + std::vector out_paddings; + + CalDeconvOutputShapeAndPadSize(input->shape(), + filter->shape(), + strides_, + padding_type_, + paddings_, + group_, + &out_shape, + &in_paddings, + &out_paddings, + nullptr, + CAFFE, + NHWC); return kernel_->Compute(context, input, diff --git a/mace/ops/depthwise_deconv2d_test.cc b/mace/ops/depthwise_deconv2d_test.cc index 0b81779e4e58bcc3915fa4a972f15607b0e11b95..0cf3de95bf5c2d077e062dcde07a232977ff8ba6 100644 --- a/mace/ops/depthwise_deconv2d_test.cc +++ b/mace/ops/depthwise_deconv2d_test.cc @@ -252,7 +252,7 @@ TEST_F(DepthwiseDeconv2dOpTest, RandomTestFloat) { RandomTest(1, 4, 256, 256, 5, 1, 3); RandomTest(1, 4, 256, 256, 5, 2, 4); } -// + TEST_F(DepthwiseDeconv2dOpTest, RandomTestHalf) { RandomTest(1, 32, 256, 256, 5, 1, 2); RandomTest(1, 3, 256, 256, 5, 1, 1); diff --git 
a/mace/ops/dynamic_lstm.cc b/mace/ops/dynamic_lstm.cc index 7fe93f21d6b7831bfe5fba3d21200a21923cdc2e..7d7014d57a7162184c93d1559dc1f93d0facde8c 100644 --- a/mace/ops/dynamic_lstm.cc +++ b/mace/ops/dynamic_lstm.cc @@ -33,10 +33,10 @@ namespace mace { namespace ops { -template +template class DynamicLSTMOp; -template +template class DynamicLSTMOp : public Operation { public: explicit DynamicLSTMOp(OpConstructContext *context) @@ -58,7 +58,6 @@ class DynamicLSTMOp : public Operation { if (std::abs(scale - 1.f) < 1e-6) return; const index_t rounds = cell_dim / 4; -#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < rounds * 4; i += 4) { #ifdef MACE_ENABLE_NEON float32x4_t in_vec = vld1q_f32(cell_data + i); @@ -86,7 +85,6 @@ class DynamicLSTMOp : public Operation { } const index_t rounds = cell_dim / 4; -#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < rounds * 4; i += 4) { #ifdef MACE_ENABLE_NEON float32x4_t in_vec = vld1q_f32(src_data + i); @@ -156,8 +154,8 @@ class DynamicLSTMOp : public Operation { MACE_CHECK(lstm_params->dim(0) == 3 && params_stride == lstm_cell_dim && lstm_cell_dim == prev_cell_dim_) << "lstm params rows:" << lstm_params->dim(0) - << "params_stride:"<< params_stride - << "!=" << "cell_dim:"<< lstm_cell_dim << std::endl; + << "params_stride:" << params_stride + << "!=" << "cell_dim:" << lstm_cell_dim << std::endl; const index_t affine_b_out_dim = weights_b->dim(0); const index_t affine_b_depth = weights_b->dim(1); const index_t affine_b_in_dim = lstm_cell_dim; @@ -262,7 +260,8 @@ class DynamicLSTMOp : public Operation { float *curr_cell_ptr = prev_cell_data + i % cell_buf_chunk * prev_cell_dim_; // LSTMNonlinear - LSTMNonlinearKernel(affine_a_out_data, + LSTMNonlinearKernel(context, + affine_a_out_data, prev_cell_ptr, nullptr, lstm_params_data, diff --git a/mace/ops/eltwise.cc b/mace/ops/eltwise.cc index 92864ae1016fad410ce054887babd09ee2557c59..04c0e10e323a53d9e3efb042366c4ff6cc1b666d 100644 --- a/mace/ops/eltwise.cc +++ b/mace/ops/eltwise.cc @@ -31,7 +31,7 @@ #include "mace/core/operator.h" #include "mace/core/tensor.h" #include "mace/utils/memory.h" -#include "mace/utils/quantize.h" +#include "mace/core/quantize.h" #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/buffer_transformer.h" #include "mace/ops/opencl/image/eltwise.h" @@ -40,7 +40,6 @@ namespace mace { namespace ops { - inline index_t GetIndex(const std::vector &shape, const std::vector &index) { index_t idx = 0; @@ -64,8 +63,9 @@ inline void IncreaseIndex(const std::vector &shape, } } -template +template inline void TensorGeneralBroadcastEltwise( + const OpContext *context, const EltwiseType type, const T *input0, const T *input1, @@ -75,6 +75,8 @@ inline void TensorGeneralBroadcastEltwise( const std::vector &input1_shape, const std::vector &output_shape, DstType *output) { + MACE_UNUSED(context); + const index_t output_size = std::accumulate( output_shape.begin(), output_shape.end(), 1, std::multiplies()); std::vector out_index(output_shape.size(), 0); @@ -209,13 +211,13 @@ inline void TensorGeneralBroadcastEltwise( IncreaseIndex(output_shape, &out_index); } break; - default: - LOG(FATAL) << "Eltwise op not support type " << type; + default:LOG(FATAL) << "Eltwise op not support type " << type; } } -template -inline void TensorBroadcastEltwise(const EltwiseType type, +template +inline void TensorBroadcastEltwise(const OpContext *context, + const EltwiseType type, const T *input0, const T *input1, const std::vector &coeff, @@ -223,437 +225,408 @@ inline void 
TensorBroadcastEltwise(const EltwiseType type, const index_t common_size, const bool swapped, DstType *output) { - switch (type) { - case SUM: - if (coeff.empty()) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t d = 0; d < diff_size; ++d) { - for (index_t i = 0; i < common_size; ++i) { - output[i + d * common_size] = - input0[i + d * common_size] + input1[i]; + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + switch (type) { + case SUM: + if (coeff.empty()) { + for (index_t d = start0; d < end0; d += step0) { + for (index_t i = start1; i < end1; i += step1) { + output[i + d * common_size] = + input0[i + d * common_size] + input1[i]; + } } - } - } else { - std::vector coeff_copy = coeff; - if (swapped) { - std::swap(coeff_copy[0], coeff_copy[1]); - } -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t d = 0; d < diff_size; ++d) { - for (index_t i = 0; i < common_size; ++i) { - output[i + d * common_size] = - input0[i + d * common_size] * coeff_copy[0] + - input1[i] * coeff_copy[1]; + } else { + std::vector coeff_copy = coeff; + if (swapped) { + std::swap(coeff_copy[0], coeff_copy[1]); + } + for (index_t d = start0; d < end0; d += step0) { + for (index_t i = start1; i < end1; i += step1) { + output[i + d * common_size] = + input0[i + d * common_size] * coeff_copy[0] + + input1[i] * coeff_copy[1]; + } } } - } - break; - case SUB: - if (!swapped) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t d = 0; d < diff_size; ++d) { - for (index_t i = 0; i < common_size; ++i) { - output[i + d * common_size] = - input0[i + d * common_size] - input1[i]; + break; + case SUB: + if (!swapped) { + for (index_t d = start0; d < end0; d += step0) { + for (index_t i = start1; i < end1; i += step1) { + output[i + d * common_size] = + input0[i + d * common_size] - input1[i]; + } + } + } else { + for (index_t d = start0; d < end0; d += step0) { + for (index_t i = start1; i < end1; i += step1) { + output[i + d * common_size] = + input1[i] - input0[i + d * common_size]; + } } } - } else { -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t d = 0; d < diff_size; ++d) { - for (index_t i = 0; i < common_size; ++i) { + break; + case PROD: + for (index_t d = start0; d < end0; d += step0) { + for (index_t i = start1; i < end1; i += step1) { output[i + d * common_size] = - input1[i] - input0[i + d * common_size]; + input0[i + d * common_size] * input1[i]; } } - } - break; - case PROD: -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t d = 0; d < diff_size; ++d) { - for (index_t i = 0; i < common_size; ++i) { - output[i + d * common_size] = input0[i + d * common_size] * input1[i]; + break; + case DIV: + if (!swapped) { + for (index_t d = start0; d < end0; d += step0) { + for (index_t i = start1; i < end1; i += step1) { + output[i + d * common_size] = + input0[i + d * common_size] / input1[i]; + } + } + } else { + for (index_t d = start0; d < end0; d += step0) { + for (index_t i = start1; i < end1; i += step1) { + output[i + d * common_size] = + input1[i] / input0[i + d * common_size]; + } + } } - } - break; - case DIV: - if (!swapped) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t d = 0; d < diff_size; ++d) { - for (index_t i = 0; i < common_size; ++i) { - output[i + d * common_size] = - input0[i + d * common_size] / input1[i]; + break; 
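// A minimal sketch of the Compute2D pattern this hunk applies to every case:
// each former "#pragma omp parallel for collapse(2)" pair of loops becomes a
// lambda handed to utils::ThreadPool::Compute2D, which gives each worker a
// [start, end) slice and a step for both loop dimensions. Reduced here to the
// SUM case above; the BroadcastSum name is illustrative only (the patch does
// not add it), and the snippet assumes the declarations already visible in
// this file (utils::ThreadPool, index_t), with the pool obtained from
// context->device()->cpu_runtime()->thread_pool() exactly as the patch does.
inline void BroadcastSum(utils::ThreadPool &thread_pool,
                         const float *input0,       // diff_size x common_size
                         const float *input1,       // common_size
                         const index_t diff_size,
                         const index_t common_size,
                         float *output) {
  thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
                            index_t start1, index_t end1, index_t step1) {
    for (index_t d = start0; d < end0; d += step0) {
      for (index_t i = start1; i < end1; i += step1) {
        output[i + d * common_size] =
            input0[i + d * common_size] + input1[i];
      }
    }
  }, 0, diff_size, 1,      // dimension 0: range [0, diff_size), step 1
     0, common_size, 1);   // dimension 1: range [0, common_size), step 1
}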
+ case FLOOR_DIV: + if (!swapped) { + for (index_t d = start0; d < end0; d += step0) { + for (index_t i = start1; i < end1; i += step1) { + output[i + d * common_size] = + std::floor(input0[i + d * common_size] / input1[i]); + } + } + } else { + for (index_t d = start0; d < end0; d += step0) { + for (index_t i = start1; i < end1; i += step1) { + output[i + d * common_size] = + std::floor(input1[i] / input0[i + d * common_size]); + } } } - } else { -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t d = 0; d < diff_size; ++d) { - for (index_t i = 0; i < common_size; ++i) { + break; + case MIN: + for (index_t d = start0; d < end0; d += step0) { + for (index_t i = start1; i < end1; i += step1) { output[i + d * common_size] = - input1[i] / input0[i + d * common_size]; + std::min(input0[i + d * common_size], input1[i]); } } - } - break; - case FLOOR_DIV: - if (!swapped) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t d = 0; d < diff_size; ++d) { - for (index_t i = 0; i < common_size; ++i) { + break; + case MAX: + for (index_t d = start0; d < end0; d += step0) { + for (index_t i = start1; i < end1; i += step1) { output[i + d * common_size] = - std::floor(input0[i + d * common_size] / input1[i]); + std::max(input0[i + d * common_size], input1[i]); } } - } else { -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t d = 0; d < diff_size; ++d) { - for (index_t i = 0; i < common_size; ++i) { + break; + case SQR_DIFF: + for (index_t d = start0; d < end0; d += step0) { + for (index_t i = start1; i < end1; i += step1) { output[i + d * common_size] = - std::floor(input1[i] / input0[i + d * common_size]); + std::pow(input0[i + d * common_size] - input1[i], 2.f); } } - } - break; - case MIN: -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t d = 0; d < diff_size; ++d) { - for (index_t i = 0; i < common_size; ++i) { - output[i + d * common_size] = - std::min(input0[i + d * common_size], input1[i]); - } - } - break; - case MAX: -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t d = 0; d < diff_size; ++d) { - for (index_t i = 0; i < common_size; ++i) { - output[i + d * common_size] = - std::max(input0[i + d * common_size], input1[i]); + break; + case POW: + if (!swapped) { + for (index_t d = start0; d < end0; d += step0) { + for (index_t i = start1; i < end1; i += step1) { + output[i + d * common_size] = + std::pow(input0[i + d * common_size], input1[i]); + } + } + } else { + for (index_t d = start0; d < end0; d += step0) { + for (index_t i = start1; i < end1; i += step1) { + output[i + d * common_size] = + std::pow(input1[i], input0[i + d * common_size]); + } + } } - } - break; - case SQR_DIFF: -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t d = 0; d < diff_size; ++d) { - for (index_t i = 0; i < common_size; ++i) { - output[i + d * common_size] = - std::pow(input0[i + d * common_size] - input1[i], 2.f); + break; + case NEG: + for (index_t d = start0; d < end0; d += step0) { + for (index_t i = start1; i < end1; i += step1) { + output[i + d * common_size] = -input0[i + d * common_size]; + } } - } - break; - case POW: - if (!swapped) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t d = 0; d < diff_size; ++d) { - for (index_t i = 0; i < common_size; ++i) { + break; + case ABS: + for (index_t d = start0; d < end0; d += step0) { + for (index_t i = start1; i < end1; i += step1) { output[i + d * common_size] = - std::pow(input0[i + d * common_size], input1[i]); + 
std::fabs(input0[i + d * common_size]); } } - } else { -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t d = 0; d < diff_size; ++d) { - for (index_t i = 0; i < common_size; ++i) { + break; + case EQUAL: + for (index_t d = start0; d < end0; d += step0) { + for (index_t i = start1; i < end1; i += step1) { output[i + d * common_size] = - std::pow(input1[i], input0[i + d * common_size]); + input0[i + d * common_size] == input1[i]; } } - } - break; - case NEG: -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < diff_size * common_size; ++i) { - output[i] = -input0[i]; - } - break; - case ABS: -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < diff_size * common_size; ++i) { - output[i] = std::fabs(input0[i]); - } - break; - case EQUAL: -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t d = 0; d < diff_size; ++d) { - for (index_t i = 0; i < common_size; ++i) { - output[i + d * common_size] = - input0[i + d * common_size] == input1[i]; - } - } - break; - default: - LOG(FATAL) << "Eltwise op not support type " << type; - } + break; + default:LOG(FATAL) << "Eltwise op not support type " << type; + } + }, 0, diff_size, 1, 0, common_size, 1); } // Multiplication is costly, so we specialize the following case. -template -inline void TensorEltwise(const EltwiseType type, +template +inline void TensorEltwise(const OpContext *context, + const EltwiseType type, const T *input0, const T *input1, const std::vector &coeff, const index_t size, const bool swapped, DstType *output) { - switch (type) { - case SUM: - if (coeff.empty()) { -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = input0[i] + input1[i]; - } + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); - } else { - std::vector coeff_copy = coeff; - if (swapped) { - std::swap(coeff_copy[0], coeff_copy[1]); - } -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = input0[i] * coeff_copy[0] + input1[i] * coeff_copy[1]; - } - } - break; - case SUB: - if (!swapped) { -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = input0[i] - input1[i]; - } + thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { + switch (type) { + case SUM: + if (coeff.empty()) { + for (index_t i = start; i < end; i += step) { + output[i] = input0[i] + input1[i]; + } - } else { -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = input1[i] - input0[i]; + } else { + std::vector coeff_copy = coeff; + if (swapped) { + std::swap(coeff_copy[0], coeff_copy[1]); + } + for (index_t i = start; i < end; i += step) { + output[i] = input0[i] * coeff_copy[0] + input1[i] * coeff_copy[1]; + } } - } - break; - case PROD: -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = input0[i] * input1[i]; - } + break; + case SUB: + if (!swapped) { + for (index_t i = start; i < end; i += step) { + output[i] = input0[i] - input1[i]; + } - break; - case DIV: - if (!swapped) { -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = input0[i] / input1[i]; + } else { + for (index_t i = start; i < end; i += step) { + output[i] = input1[i] - input0[i]; + } + } + break; + case PROD: + for (index_t i = start; i < end; i += step) { + output[i] = input0[i] * input1[i]; } - } else { -#pragma omp parallel for schedule(runtime) - for 
(index_t i = 0; i < size; ++i) { - output[i] = input1[i] / input0[i]; + break; + case DIV: + if (!swapped) { + for (index_t i = start; i < end; i += step) { + output[i] = input0[i] / input1[i]; + } + + } else { + for (index_t i = start; i < end; i += step) { + output[i] = input1[i] / input0[i]; + } } - } - break; - case FLOOR_DIV: - if (!swapped) { -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = std::floor(input0[i] / input1[i]); + break; + case FLOOR_DIV: + if (!swapped) { + for (index_t i = start; i < end; i += step) { + output[i] = std::floor(input0[i] / input1[i]); + } + } else { + for (index_t i = start; i < end; i += step) { + output[i] = std::floor(input1[i] / input0[i]); + } } - } else { -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = std::floor(input1[i] / input0[i]); + break; + case MIN: + for (index_t i = start; i < end; i += step) { + output[i] = std::min(input0[i], input1[i]); } - } - break; - case MIN: -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = std::min(input0[i], input1[i]); - } - break; - case MAX: -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = std::max(input0[i], input1[i]); - } + break; + case MAX: + for (index_t i = start; i < end; i += step) { + output[i] = std::max(input0[i], input1[i]); + } - break; - case SQR_DIFF: -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = std::pow(input0[i] - input1[i], 2.f); - } + break; + case SQR_DIFF: + for (index_t i = start; i < end; i += step) { + output[i] = std::pow(input0[i] - input1[i], 2.f); + } - break; - case POW: - if (!swapped) { -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = std::pow(input0[i], input1[i]); + break; + case POW: + if (!swapped) { + for (index_t i = start; i < end; i += step) { + output[i] = std::pow(input0[i], input1[i]); + } + } else { + for (index_t i = start; i < end; i += step) { + output[i] = std::pow(input1[i], input0[i]); + } } - } else { -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = std::pow(input1[i], input0[i]); + break; + case NEG: + for (index_t i = start; i < end; i += step) { + output[i] = -input0[i]; } - } - break; - case NEG: -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = -input0[i]; - } - break; - case ABS: -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = std::fabs(input0[i]); - } - break; - case EQUAL: -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = input0[i] == input1[i]; - } - break; - default: - LOG(FATAL) << "Eltwise op not support type " << type; - } + break; + case ABS: + for (index_t i = start; i < end; i += step) { + output[i] = std::fabs(input0[i]); + } + break; + case EQUAL: + for (index_t i = start; i < end; i += step) { + output[i] = input0[i] == input1[i]; + } + break; + default:LOG(FATAL) << "Eltwise op not support type " << type; + } + }, 0, size, 1); } // Multiplication is costly, so we specialize the following case. 
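// A minimal sketch of the one-dimensional form of the same migration, as used
// by TensorEltwise above and TensorScalarEltwise below: the former
// "#pragma omp parallel for schedule(runtime)" loop becomes a lambda that
// utils::ThreadPool::Compute1D invokes with a [start, end) range and a step.
// The ScalarSum name is illustrative only (the patch does not add it), and the
// snippet assumes the declarations already available in this file, with the
// pool taken from context->device()->cpu_runtime()->thread_pool().
inline void ScalarSum(utils::ThreadPool &thread_pool,
                      const float *input0,
                      const float input1,        // a single broadcast scalar
                      const index_t size,
                      float *output) {
  thread_pool.Compute1D([=](index_t start, index_t end, index_t step) {
    for (index_t i = start; i < end; i += step) {
      output[i] = input0[i] + input1;  // SUM case, without coefficients
    }
  }, 0, size, 1);                      // range [0, size), step 1
}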
-template -inline void TensorScalarEltwise(const EltwiseType type, +template +inline void TensorScalarEltwise(const OpContext *context, + const EltwiseType type, const T *input0, const T input1, const std::vector &coeff, const index_t size, const bool swapped, DstType *output) { - switch (type) { - case SUM: - if (coeff.empty()) { -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = input0[i] + input1; - } + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); - } else { - std::vector coeff_copy = coeff; - if (swapped) { - std::swap(coeff_copy[0], coeff_copy[1]); - } -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = input0[i] * coeff_copy[0] + input1 * coeff_copy[1]; - } - } - break; - case SUB: - if (!swapped) { -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = input0[i] - input1; - } + thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { + switch (type) { + case SUM: + if (coeff.empty()) { + for (index_t i = start; i < end; i += step) { + output[i] = input0[i] + input1; + } - } else { -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = input1 - input0[i]; + } else { + std::vector coeff_copy = coeff; + if (swapped) { + std::swap(coeff_copy[0], coeff_copy[1]); + } + for (index_t i = start; i < end; i += step) { + output[i] = input0[i] * coeff_copy[0] + input1 * coeff_copy[1]; + } } - } - break; - case PROD: -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = input0[i] * input1; - } + break; + case SUB: + if (!swapped) { + for (index_t i = start; i < end; i += step) { + output[i] = input0[i] - input1; + } - break; - case DIV: - if (!swapped) { -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = input0[i] / input1; + } else { + for (index_t i = start; i < end; i += step) { + output[i] = input1 - input0[i]; + } + } + break; + case PROD: + for (index_t i = start; i < end; i += step) { + output[i] = input0[i] * input1; } - } else { -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = input1 / input0[i]; + break; + case DIV: + if (!swapped) { + for (index_t i = start; i < end; i += step) { + output[i] = input0[i] / input1; + } + + } else { + for (index_t i = start; i < end; i += step) { + output[i] = input1 / input0[i]; + } } - } - break; - case FLOOR_DIV: - if (!swapped) { -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = std::floor(input0[i] / input1); + break; + case FLOOR_DIV: + if (!swapped) { + for (index_t i = start; i < end; i += step) { + output[i] = std::floor(input0[i] / input1); + } + } else { + for (index_t i = start; i < end; i += step) { + output[i] = std::floor(input1 / input0[i]); + } } - } else { -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = std::floor(input1 / input0[i]); + break; + case MIN: + for (index_t i = start; i < end; i += step) { + output[i] = std::min(input0[i], input1); } - } - break; - case MIN: -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = std::min(input0[i], input1); - } - break; - case MAX: -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = std::max(input0[i], input1); - } + break; + case MAX: + for (index_t i = 
start; i < end; i += step) { + output[i] = std::max(input0[i], input1); + } - break; - case SQR_DIFF: -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = std::pow(input0[i] - input1, 2.f); - } + break; + case SQR_DIFF: + for (index_t i = start; i < end; i += step) { + output[i] = std::pow(input0[i] - input1, 2.f); + } - break; - case POW: - if (!swapped) { -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = std::pow(input0[i], input1); + break; + case POW: + if (!swapped) { + for (index_t i = start; i < end; i += step) { + output[i] = std::pow(input0[i], input1); + } + } else { + for (index_t i = start; i < end; i += step) { + output[i] = std::pow(input1, input0[i]); + } } - } else { -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = std::pow(input1, input0[i]); + break; + case NEG: + for (index_t i = start; i < end; i += step) { + output[i] = -input0[i]; + } + break; + case ABS: + for (index_t i = start; i < end; i += step) { + output[i] = std::fabs(input0[i]); + } + break; + case EQUAL: + for (index_t i = start; i < end; i += step) { + output[i] = input0[i] == input1; } - } - break; - case NEG: -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = -input0[i]; - } - break; - case ABS: -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = std::fabs(input0[i]); - } - break; - case EQUAL: -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < size; ++i) { - output[i] = input0[i] == input1; - } - break; - default: - LOG(FATAL) << "Eltwise op not support type " << type; - } + break; + default:LOG(FATAL) << "Eltwise op not support type " << type; + } + }, 0, size, 1); } -template -inline void TensorEltwisePerChannel(const EltwiseType type, +template +inline void TensorEltwisePerChannel(const OpContext *context, + const EltwiseType type, const T *input0, const T *input1, const std::vector &coeff, @@ -663,230 +636,227 @@ inline void TensorEltwisePerChannel(const EltwiseType type, const index_t image_size, const bool swapped, DstType *output) { - switch (type) { - case SUM: - if (coeff.empty()) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < batch0; ++b) { - for (index_t c = 0; c < channel; ++c) { - const T *in0_ptr = input0 + ((b * channel) + c) * image_size; - const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); - DstType *out_ptr = output + ((b * channel) + c) * image_size; - for (index_t i = 0; i < image_size; ++i) { - out_ptr[i] = in0_ptr[i] + in1_ptr[c]; + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + switch (type) { + case SUM: + if (coeff.empty()) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { + const T *in0_ptr = input0 + ((b * channel) + c) * image_size; + const T *in1_ptr = input1 + (batch1 > 1 ? 
b * channel : 0); + DstType *out_ptr = output + ((b * channel) + c) * image_size; + for (index_t i = 0; i < image_size; ++i) { + out_ptr[i] = in0_ptr[i] + in1_ptr[c]; + } } } - } - } else { - std::vector coeff_copy = coeff; - if (swapped) { - std::swap(coeff_copy[0], coeff_copy[1]); - } -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < batch0; ++b) { - for (index_t c = 0; c < channel; ++c) { - const T *in0_ptr = input0 + ((b * channel) + c) * image_size; - const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); - DstType *out_ptr = output + ((b * channel) + c) * image_size; - for (index_t i = 0; i < image_size; ++i) { - out_ptr[i] = - in0_ptr[i] * coeff_copy[0] + in1_ptr[c] * coeff_copy[1]; + } else { + std::vector coeff_copy = coeff; + if (swapped) { + std::swap(coeff_copy[0], coeff_copy[1]); + } + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { + const T *in0_ptr = input0 + ((b * channel) + c) * image_size; + const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); + DstType *out_ptr = output + ((b * channel) + c) * image_size; + for (index_t i = 0; i < image_size; ++i) { + out_ptr[i] = + in0_ptr[i] * coeff_copy[0] + in1_ptr[c] * coeff_copy[1]; + } } } } - } - break; - case SUB: - if (!swapped) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < batch0; ++b) { - for (index_t c = 0; c < channel; ++c) { - const T *in0_ptr = input0 + ((b * channel) + c) * image_size; - const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); - DstType *out_ptr = output + ((b * channel) + c) * image_size; - for (index_t i = 0; i < image_size; ++i) { - out_ptr[i] = in0_ptr[i] - in1_ptr[c]; + break; + case SUB: + if (!swapped) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { + const T *in0_ptr = input0 + ((b * channel) + c) * image_size; + const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); + DstType *out_ptr = output + ((b * channel) + c) * image_size; + for (index_t i = 0; i < image_size; ++i) { + out_ptr[i] = in0_ptr[i] - in1_ptr[c]; + } + } + } + } else { + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { + const T *in0_ptr = input0 + ((b * channel) + c) * image_size; + const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); + DstType *out_ptr = output + ((b * channel) + c) * image_size; + for (index_t i = 0; i < image_size; ++i) { + out_ptr[i] = in1_ptr[c] - in0_ptr[i]; + } } } } - } else { -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < batch0; ++b) { - for (index_t c = 0; c < channel; ++c) { + break; + case PROD: + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { const T *in0_ptr = input0 + ((b * channel) + c) * image_size; const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); DstType *out_ptr = output + ((b * channel) + c) * image_size; for (index_t i = 0; i < image_size; ++i) { - out_ptr[i] = in1_ptr[c] - in0_ptr[i]; + out_ptr[i] = in0_ptr[i] * in1_ptr[c]; } } } - } - break; - case PROD: -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < batch0; ++b) { - for (index_t c = 0; c < channel; ++c) { - const T *in0_ptr = input0 + ((b * channel) + c) * image_size; - const T *in1_ptr = input1 + (batch1 > 1 ? 
b * channel : 0); - DstType *out_ptr = output + ((b * channel) + c) * image_size; - for (index_t i = 0; i < image_size; ++i) { - out_ptr[i] = in0_ptr[i] * in1_ptr[c]; + break; + case DIV: + if (!swapped) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { + const T *in0_ptr = input0 + ((b * channel) + c) * image_size; + const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); + DstType *out_ptr = output + ((b * channel) + c) * image_size; + for (index_t i = 0; i < image_size; ++i) { + out_ptr[i] = in0_ptr[i] / in1_ptr[c]; + } + } + } + } else { + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { + const T *in0_ptr = input0 + ((b * channel) + c) * image_size; + const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); + DstType *out_ptr = output + ((b * channel) + c) * image_size; + for (index_t i = 0; i < image_size; ++i) { + out_ptr[i] = in1_ptr[c] / in0_ptr[i]; + } + } } } - } - break; - case DIV: - if (!swapped) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < batch0; ++b) { - for (index_t c = 0; c < channel; ++c) { - const T *in0_ptr = input0 + ((b * channel) + c) * image_size; - const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); - DstType *out_ptr = output + ((b * channel) + c) * image_size; - for (index_t i = 0; i < image_size; ++i) { - out_ptr[i] = in0_ptr[i] / in1_ptr[c]; + break; + case FLOOR_DIV: + if (!swapped) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { + const T *in0_ptr = input0 + ((b * channel) + c) * image_size; + const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); + DstType *out_ptr = output + ((b * channel) + c) * image_size; + for (index_t i = 0; i < image_size; ++i) { + out_ptr[i] = std::floor(in0_ptr[i] / in1_ptr[c]); + } + } + } + } else { + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { + const T *in0_ptr = input0 + ((b * channel) + c) * image_size; + const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); + DstType *out_ptr = output + ((b * channel) + c) * image_size; + for (index_t i = 0; i < image_size; ++i) { + out_ptr[i] = std::floor(in1_ptr[c] / in0_ptr[i]); + } } } } - } else { -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < batch0; ++b) { - for (index_t c = 0; c < channel; ++c) { + break; + case MIN: + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { const T *in0_ptr = input0 + ((b * channel) + c) * image_size; const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); DstType *out_ptr = output + ((b * channel) + c) * image_size; for (index_t i = 0; i < image_size; ++i) { - out_ptr[i] = in1_ptr[c] / in0_ptr[i]; + out_ptr[i] = std::min(in0_ptr[i], in1_ptr[c]); } } } - } - break; - case FLOOR_DIV: - if (!swapped) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < batch0; ++b) { - for (index_t c = 0; c < channel; ++c) { + break; + case MAX: + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { const T *in0_ptr = input0 + ((b * channel) + c) * image_size; const T *in1_ptr = input1 + (batch1 > 1 ? 
b * channel : 0); DstType *out_ptr = output + ((b * channel) + c) * image_size; for (index_t i = 0; i < image_size; ++i) { - out_ptr[i] = std::floor(in0_ptr[i] / in1_ptr[c]); + out_ptr[i] = std::max(in0_ptr[i], in1_ptr[c]); } } } - } else { -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < batch0; ++b) { - for (index_t c = 0; c < channel; ++c) { + break; + case SQR_DIFF: + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { const T *in0_ptr = input0 + ((b * channel) + c) * image_size; const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); DstType *out_ptr = output + ((b * channel) + c) * image_size; for (index_t i = 0; i < image_size; ++i) { - out_ptr[i] = std::floor(in1_ptr[c] / in0_ptr[i]); + out_ptr[i] = std::pow(in0_ptr[i] - in1_ptr[c], 2.f); } } } - } - break; - case MIN: -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < batch0; ++b) { - for (index_t c = 0; c < channel; ++c) { - const T *in0_ptr = input0 + ((b * channel) + c) * image_size; - const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); - DstType *out_ptr = output + ((b * channel) + c) * image_size; - for (index_t i = 0; i < image_size; ++i) { - out_ptr[i] = std::min(in0_ptr[i], in1_ptr[c]); + break; + case POW: + if (!swapped) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { + const T *in0_ptr = input0 + ((b * channel) + c) * image_size; + const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); + DstType *out_ptr = output + ((b * channel) + c) * image_size; + for (index_t i = 0; i < image_size; ++i) { + out_ptr[i] = std::pow(in0_ptr[i], in1_ptr[c]); + } + } } - } - } - break; - case MAX: -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < batch0; ++b) { - for (index_t c = 0; c < channel; ++c) { - const T *in0_ptr = input0 + ((b * channel) + c) * image_size; - const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); - DstType *out_ptr = output + ((b * channel) + c) * image_size; - for (index_t i = 0; i < image_size; ++i) { - out_ptr[i] = std::max(in0_ptr[i], in1_ptr[c]); + } else { + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { + const T *in0_ptr = input0 + ((b * channel) + c) * image_size; + const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); + DstType *out_ptr = output + ((b * channel) + c) * image_size; + for (index_t i = 0; i < image_size; ++i) { + out_ptr[i] = std::pow(in1_ptr[c], in0_ptr[i]); + } + } } } - } - break; - case SQR_DIFF: -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < batch0; ++b) { - for (index_t c = 0; c < channel; ++c) { - const T *in0_ptr = input0 + ((b * channel) + c) * image_size; - const T *in1_ptr = input1 + (batch1 > 1 ? 
b * channel : 0); - DstType *out_ptr = output + ((b * channel) + c) * image_size; - for (index_t i = 0; i < image_size; ++i) { - out_ptr[i] = std::pow(in0_ptr[i] - in1_ptr[c], 2.f); + break; + case NEG: + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { + DstType *out_ptr = output + ((b * channel) + c) * image_size; + for (index_t i = 0; i < image_size; ++i) { + out_ptr[i] = -input0[i]; + } } } - } - break; - case POW: - if (!swapped) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < batch0; ++b) { - for (index_t c = 0; c < channel; ++c) { - const T *in0_ptr = input0 + ((b * channel) + c) * image_size; - const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); - DstType *out_ptr = output + ((b * channel) + c) * image_size; + break; + case ABS: + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { for (index_t i = 0; i < image_size; ++i) { - out_ptr[i] = std::pow(in0_ptr[i], in1_ptr[c]); + output[i] = std::fabs(input0[i]); } } } - } else { -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < batch0; ++b) { - for (index_t c = 0; c < channel; ++c) { + break; + case EQUAL: + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { const T *in0_ptr = input0 + ((b * channel) + c) * image_size; const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); DstType *out_ptr = output + ((b * channel) + c) * image_size; for (index_t i = 0; i < image_size; ++i) { - out_ptr[i] = std::pow(in1_ptr[c], in0_ptr[i]); + out_ptr[i] = in0_ptr[i] == in1_ptr[c]; } } } - } - break; - case NEG: -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < batch0 * channel * image_size; ++i) { - output[i] = -input0[i]; - } - break; - case ABS: -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < batch0 * channel * image_size; ++i) { - output[i] = std::fabs(input0[i]); - } - break; - case EQUAL: -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < batch0; ++b) { - for (index_t c = 0; c < channel; ++c) { - const T *in0_ptr = input0 + ((b * channel) + c) * image_size; - const T *in1_ptr = input1 + (batch1 > 1 ? 
b * channel : 0); - DstType *out_ptr = output + ((b * channel) + c) * image_size; - for (index_t i = 0; i < image_size; ++i) { - out_ptr[i] = in0_ptr[i] == in1_ptr[c]; - } - } - } - break; - default: - LOG(FATAL) << "Eltwise op not support type " << type; - } + break; + default:LOG(FATAL) << "Eltwise op not support type " << type; + } + }, 0, batch0, 1, 0, channel, 1); } -template +template class EltwiseOp : public Operation { public: explicit EltwiseOp(OpConstructContext *context) @@ -915,15 +885,16 @@ class EltwiseOp : public Operation { if (IsLogicalType(type_)) { // as we do not have bool-type tensor, we use int type - return DoEltwise(input0, input1, output); + return DoEltwise(context, input0, input1, output); } else { - return DoEltwise(input0, input1, output); + return DoEltwise(context, input0, input1, output); } } private: - template - MaceStatus DoEltwise(const Tensor *input0, + template + MaceStatus DoEltwise(const OpContext *context, + const Tensor *input0, const Tensor *input1, Tensor *output) { bool swapped = false; @@ -970,12 +941,20 @@ class EltwiseOp : public Operation { Tensor::MappingGuard output_guard(output); DstType *output_ptr = output->mutable_data(); if (input1->size() < input0->size()) { - TensorEltwisePerChannel( - type_, input0_ptr, input1_ptr, coeff_, input0->dim(0), - input1->dim_size() == 1 ? 1 : input1->dim(0), input0->dim(1), - input0->dim(2) * input0->dim(3), swapped, output_ptr); + TensorEltwisePerChannel(context, + type_, + input0_ptr, + input1_ptr, + coeff_, + input0->dim(0), + input1->dim_size() == 1 ? 1 : input1->dim(0), + input0->dim(1), + input0->dim(2) * input0->dim(3), + swapped, + output_ptr); } else { - TensorEltwise(type_, input0_ptr, input1_ptr, coeff_, input0->size(), + TensorEltwise(context, + type_, input0_ptr, input1_ptr, coeff_, input0->size(), swapped, output_ptr); } } else { @@ -1002,19 +981,23 @@ class EltwiseOp : public Operation { } if (input1->size() == 1) { - TensorScalarEltwise(type_, input0_ptr, input1_ptr[0], coeff_, + TensorScalarEltwise(context, + type_, input0_ptr, input1_ptr[0], coeff_, input0->size(), swapped, output_ptr); } else if (input0_shape == input1_shape) { - TensorEltwise(type_, input0_ptr, input1_ptr, coeff_, input0->size(), + TensorEltwise(context, + type_, input0_ptr, input1_ptr, coeff_, input0->size(), swapped, output_ptr); } else if (need_general_broadcast) { - TensorGeneralBroadcastEltwise(type_, input0_ptr, input1_ptr, coeff_, + TensorGeneralBroadcastEltwise(context, + type_, input0_ptr, input1_ptr, coeff_, swapped, input0_shape, input1_shape, output_shape, output_ptr); } else { index_t common_size = input1->size(); index_t diff_size = input0->size() / common_size; - TensorBroadcastEltwise(type_, input0_ptr, input1_ptr, coeff_, + TensorBroadcastEltwise(context, + type_, input0_ptr, input1_ptr, coeff_, diff_size, common_size, swapped, output_ptr); } } @@ -1096,37 +1079,41 @@ class EltwiseOp : public Operation { auto input0_ptr = input0->data(); auto input1_ptr = input1->data(); auto output_ptr = output->mutable_data(); -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < output->size(); ++i) { - const int32_t offset_input0 = input0_ptr[i] - input0->zero_point(); - const int32_t offset_input1 = input1_ptr[i] - input1->zero_point(); - const int32_t shifted_input0 = offset_input0 * (1 << left_shift); - const int32_t shifted_input1 = offset_input1 * (1 << left_shift); - const int32_t multiplied_input0 = - gemmlowp::RoundingDivideByPOT( - 
gemmlowp::SaturatingRoundingDoublingHighMul(shifted_input0, - input0_multiplier), - -input0_shift); - const int32_t multiplied_input1 = - gemmlowp::RoundingDivideByPOT( - gemmlowp::SaturatingRoundingDoublingHighMul(shifted_input1, - input1_multiplier), - -input1_shift); - - int32_t res; - if (type_ == SUM) { - res = multiplied_input0 + multiplied_input1; - } else { - res = multiplied_input0 - multiplied_input1; - } - const int32_t output_val = - gemmlowp::RoundingDivideByPOT( - gemmlowp::SaturatingRoundingDoublingHighMul(res, - output_multiplier), - -output_shift) + output->zero_point(); - output_ptr[i] = Saturate(output_val); - } + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { + for (index_t i = start; i < end; i += step) { + const int32_t offset_input0 = input0_ptr[i] - input0->zero_point(); + const int32_t offset_input1 = input1_ptr[i] - input1->zero_point(); + const int32_t shifted_input0 = offset_input0 * (1 << left_shift); + const int32_t shifted_input1 = offset_input1 * (1 << left_shift); + const int32_t multiplied_input0 = + gemmlowp::RoundingDivideByPOT( + gemmlowp::SaturatingRoundingDoublingHighMul(shifted_input0, + input0_multiplier), + -input0_shift); + const int32_t multiplied_input1 = + gemmlowp::RoundingDivideByPOT( + gemmlowp::SaturatingRoundingDoublingHighMul(shifted_input1, + input1_multiplier), + -input1_shift); + + int32_t res; + if (type_ == SUM) { + res = multiplied_input0 + multiplied_input1; + } else { + res = multiplied_input0 - multiplied_input1; + } + + const int32_t output_val = + gemmlowp::RoundingDivideByPOT( + gemmlowp::SaturatingRoundingDoublingHighMul(res, + output_multiplier), + -output_shift) + output->zero_point(); + output_ptr[i] = Saturate(output_val); + } + }, 0, output->size(), 1); #endif // NEON return MaceStatus::MACE_SUCCESS; @@ -1203,7 +1190,6 @@ class EltwiseOp : public Operation { }; #endif // MACE_ENABLE_OPENCL - void RegisterEltwise(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "Eltwise", EltwiseOp, DeviceType::CPU, float); diff --git a/mace/ops/expand_dims.cc b/mace/ops/expand_dims.cc index 2d99d7a742659549c750fc1246449f35701f2277..78fed15619553b3903d8c71015b4d4228f6a5c7a 100644 --- a/mace/ops/expand_dims.cc +++ b/mace/ops/expand_dims.cc @@ -20,10 +20,10 @@ namespace mace { namespace ops { -template +template class ExpandDimsOp; -template +template class ExpandDimsOp : public Operation { public: explicit ExpandDimsOp(OpConstructContext *context) @@ -50,14 +50,15 @@ class ExpandDimsOp : public Operation { // only tensorflow support expand dim, so the default format is NHWC // transform NHWC to NCHW auto t_output_shape = TransposeShape(output_shape, - {0, 3, 1, 2}); + {0, 3, 1, 2}); output->Resize(t_output_shape); Tensor::MappingGuard input_guard(input); Tensor::MappingGuard output_guard(output); auto input_data = input->data(); auto output_data = output->mutable_data(); - Transpose(input_data, output_shape, {0, 3, 1, 2}, output_data); + Transpose(&context->device()->cpu_runtime()->thread_pool(), + input_data, output_shape, {0, 3, 1, 2}, output_data); } else { output->Resize(output_shape); Tensor::MappingGuard input_guard(input); diff --git a/mace/ops/fully_connected.cc b/mace/ops/fully_connected.cc index 04745a055cfd519e8df365e430d952b206c843e9..64765d9c99f6a9ade2b8ef7a1a2cdd5874f3c243 100644 --- a/mace/ops/fully_connected.cc +++ b/mace/ops/fully_connected.cc @@ -22,8 +22,8 @@ #include "mace/ops/activation.h" #ifdef 
MACE_ENABLE_NEON - #include "mace/ops/arm/fp32/gemv.h" +#include "mace/ops/arm/fp32/activation.h" #ifdef MACE_ENABLE_QUANTIZE #include "mace/ops/arm/q8/gemv.h" @@ -31,6 +31,7 @@ #else #include "mace/ops/ref/gemv.h" +#include "mace/ops/ref/activation.h" #endif // MACE_ENABLE_NEON #ifdef MACE_ENABLE_OPENCL @@ -69,7 +70,10 @@ template<> class FullyConnectedOp : public FullyConnectedOpBase { public: explicit FullyConnectedOp(OpConstructContext *context) - : FullyConnectedOpBase(context) {} + : FullyConnectedOpBase(context), + activation_delegator_(activation_, + relux_max_limit_, + leakyrelu_coefficient_) {} MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); @@ -106,10 +110,8 @@ class FullyConnectedOp : public FullyConnectedOpBase { false, true, output); - Tensor::MappingGuard guard_output(output); - float *output_ptr = output->mutable_data(); - DoActivation(output_ptr, output_ptr, output->size(), activation_, - relux_max_limit_, leakyrelu_coefficient_); + + activation_delegator_.Compute(context, output, output); return MaceStatus::MACE_SUCCESS; } @@ -117,8 +119,10 @@ class FullyConnectedOp : public FullyConnectedOpBase { private: #ifdef MACE_ENABLE_NEON arm::fp32::Gemv gemv_; + arm::fp32::Activation activation_delegator_; #else ref::Gemv gemv_; + ref::Activation activation_delegator_; #endif // MACE_ENABLE_NEON }; diff --git a/mace/ops/fully_connected_test.cc b/mace/ops/fully_connected_test.cc index ce10c97e606c6394fe1046a35b6978099dd313b6..64fead6e05bc4a1d552d20e55a8645b589751968 100644 --- a/mace/ops/fully_connected_test.cc +++ b/mace/ops/fully_connected_test.cc @@ -261,7 +261,9 @@ void QuantRandom(const index_t batch, auto bias_data = bias->data(); float bias_scale = q_input->scale() * q_weight->scale(); std::vector q_bias(bias->size()); - QuantizeWithScaleAndZeropoint( + + QuantizeUtil quantize_util(OpTestContext::Get()->thread_pool()); + quantize_util.QuantizeWithScaleAndZeropoint( bias_data, bias->size(), bias_scale, 0, q_bias.data()); net.AddInputFromArray( "QuantizedBias", {out_channel}, q_bias, true, bias_scale, 0); diff --git a/mace/ops/gather.cc b/mace/ops/gather.cc index 453a201cb71e0fd2aff59accb13b037aa50d1612..0c0551cd396af2279f47b245c371df4989143a98 100644 --- a/mace/ops/gather.cc +++ b/mace/ops/gather.cc @@ -53,16 +53,15 @@ class GatherOp : public Operation { const T *params_data = params->data(); T *output_data = output->mutable_data(); - index_t axis_dim_size = params->dim(axis_); - index_t lhs_size = std::accumulate(params->shape().begin(), + const index_t axis_dim_size = params->dim(axis_); + const index_t lhs_size = std::accumulate(params->shape().begin(), params->shape().begin() + axis_, 1, std::multiplies()); - index_t rhs_size = + const index_t rhs_size = std::accumulate(params->shape().begin() + (axis_ + 1), params->shape().end(), 1, std::multiplies()); - index_t index_size = indices->size(); + const index_t index_size = indices->size(); -#pragma omp parallel for collapse(2) schedule(runtime) for (index_t l = 0; l < lhs_size; ++l) { for (index_t idx = 0; idx < index_size; ++idx) { MACE_ASSERT(indices_data[idx] < axis_dim_size, "idx out of bound: ", diff --git a/mace/ops/local_response_norm.cc b/mace/ops/local_response_norm.cc index f70d8342d12df14f131c910feae95fc10fe5b567..022ee3e7aa979ee36794f0fe6c4888012a0f0cb2 100644 --- a/mace/ops/local_response_norm.cc +++ b/mace/ops/local_response_norm.cc @@ -20,10 +20,10 @@ namespace mace { namespace ops { -template +template class LocalResponseNormOp; -template <> +template<> class LocalResponseNormOp : public 
Operation { public: explicit LocalResponseNormOp(OpConstructContext *context) @@ -51,29 +51,35 @@ class LocalResponseNormOp : public Operation { const float *input_ptr = input->data(); float *output_ptr = output->mutable_data(); - index_t image_size = height * width; - index_t batch_size = channels * image_size; - -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < batch; ++b) { - for (index_t c = 0; c < channels; ++c) { - const int begin_input_c = std::max(static_cast(0), - c - depth_radius_); - const int end_input_c = std::min(channels, c + depth_radius_ + 1); - - index_t pos = b * batch_size; - for (index_t hw = 0; hw < height * width; ++hw, ++pos) { - float accum = 0.f; - for (int input_c = begin_input_c; input_c < end_input_c; ++input_c) { - const float input_val = input_ptr[pos + input_c * image_size]; - accum += input_val * input_val; + const index_t image_size = height * width; + const index_t batch_size = channels * image_size; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { + const index_t begin_input_c = std::max(static_cast(0), + c - depth_radius_); + const index_t end_input_c = std::min(channels, c + depth_radius_ + 1); + + index_t pos = b * batch_size; + for (index_t hw = 0; hw < height * width; ++hw, ++pos) { + float accum = 0.f; + for (index_t input_c = begin_input_c; input_c < end_input_c; + ++input_c) { + const float input_val = input_ptr[pos + input_c * image_size]; + accum += input_val * input_val; + } + const float multiplier = std::pow(bias_ + alpha_ * accum, -beta_); + output_ptr[pos + c * image_size] = + input_ptr[pos + c * image_size] * multiplier; } - const float multiplier = std::pow(bias_ + alpha_ * accum, -beta_); - output_ptr[pos + c * image_size] = - input_ptr[pos + c * image_size] * multiplier; } } - } + }, 0, batch, 1, 0, channels, 1); return MaceStatus::MACE_SUCCESS; } diff --git a/mace/ops/lstm_nonlinear.cc b/mace/ops/lstm_nonlinear.cc index 745c4d79674c6e2becc2eb49b2d855a2819a0e15..596c9ad77bd3add3d99043e5a5f4ebd33db5dade 100644 --- a/mace/ops/lstm_nonlinear.cc +++ b/mace/ops/lstm_nonlinear.cc @@ -24,10 +24,10 @@ namespace mace { namespace ops { -template +template class LSTMNonlinearOp; -template +template class LSTMNonlinearOp : public Operation { public: explicit LSTMNonlinearOp(OpConstructContext *context) @@ -45,12 +45,7 @@ class LSTMNonlinearOp : public Operation { << "The input dim size should >= 2"; MACE_CHECK(params->dim_size() == 2) << "The params dim size should be 2"; - return Compute(input, params, output); - } - MaceStatus Compute(const Tensor *input, - const Tensor *params, - Tensor *output) { const std::vector &input_shape = input->shape(); const std::vector ¶ms_shape = params->shape(); @@ -77,7 +72,7 @@ class LSTMNonlinearOp : public Operation { const float *input_data = input->data(); const float *params_data = params->data(); float *output_data = output->mutable_data(); -#pragma omp parallel for schedule(runtime) + for (int r = 0; r < num_rows; ++r) { const float *input_row = input_data + r * input_cols; const float *prev_row = input_row + 4 * cell_dim; @@ -85,7 +80,8 @@ class LSTMNonlinearOp : public Operation { embed_scales ? 
prev_row + cell_dim : nullptr; float *output_cell = output_data + r * output_dim; float *output_row = output_cell + cell_dim; - LSTMNonlinearKernel(input_row, + LSTMNonlinearKernel(context, + input_row, prev_row, scale_data, params_data, diff --git a/mace/ops/matmul.cc b/mace/ops/matmul.cc index 3b0913de574607660b807ea133f3e797a30aca71..65df7305ea769cbbfd5a6c5ebfa8a779b95fe954 100644 --- a/mace/ops/matmul.cc +++ b/mace/ops/matmul.cc @@ -195,12 +195,18 @@ class MatMulOp : public MatMulOpBase { Tensor::MappingGuard c_guard(C); const float *bias_data = bias->data(); float *c_data = C->mutable_data(); -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t i = 0; i < batch * rows; ++i) { - for (index_t w = 0; w < cols; ++w) { - c_data[i * cols + w] += bias_data[w]; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t i = start0; i < end0; i += step0) { + for (index_t w = start1; w < end1; w += step1) { + c_data[i * cols + w] += bias_data[w]; + } } - } + }, 0, batch * rows, 1, 0, cols, 1); } return ret; diff --git a/mace/ops/one_hot.cc b/mace/ops/one_hot.cc index 1d243f202f1fa5ad65c4abd58892df2a31dd9155..1596286af6ae4af96e5e7d01194fa5eff7e235a2 100644 --- a/mace/ops/one_hot.cc +++ b/mace/ops/one_hot.cc @@ -78,7 +78,6 @@ class OneHotOp : public OneHotOpBase { const index_t batch = input->dim(0); if (axis == 1) { -#pragma omp parallel for collapse(2) for (index_t i = 0; i < batch; ++i) { for (index_t j = 0; j < depth_; ++j) { output_ptr[i * depth_ + j] = input_ptr[i] == j ? on_value_ : @@ -86,7 +85,6 @@ class OneHotOp : public OneHotOpBase { } } } else { -#pragma omp parallel for collapse(2) for (index_t i = 0; i < depth_; ++i) { for (index_t j = 0; j < batch; ++j) { output_ptr[i * batch + j] = input_ptr[j] == i ? 
on_value_ : diff --git a/mace/ops/opencl/buffer_transformer.h b/mace/ops/opencl/buffer_transformer.h index dbb6eab64c22f2941c2710f6a2730a527149f6c3..20dc6d1ac9da37ca99bc70eed9905afbfd89ceb7 100644 --- a/mace/ops/opencl/buffer_transformer.h +++ b/mace/ops/opencl/buffer_transformer.h @@ -29,7 +29,7 @@ namespace mace { namespace ops { // Only used for GPU Operation(BufferTransform) -template +template class OpenCLBufferTransformer { public: OpenCLBufferTransformer(const MemoryType in_mem_type, @@ -79,10 +79,12 @@ class OpenCLBufferTransformer { const float *input_ptr = input->data(); Tensor::MappingGuard guard(internal_tensor); float *internal_ptr = internal_tensor->mutable_data(); - MACE_RETURN_IF_ERROR(ops::Transpose(input_ptr, - input->shape(), - dst_dims, - internal_ptr)); + MACE_RETURN_IF_ERROR(ops::Transpose( + &context->device()->cpu_runtime()->thread_pool(), + input_ptr, + input->shape(), + dst_dims, + internal_ptr)); } else { internal_tensor->Resize(input->shape()); const uint8_t *input_ptr = input->data(); @@ -117,7 +119,8 @@ class OpenCLBufferTransformer { const float *internal_ptr = internal_tensor.data(); output->Resize(output_shape); float *output_ptr = output->mutable_data(); - return ops::Transpose(internal_ptr, + return ops::Transpose(&context->device()->cpu_runtime()->thread_pool(), + internal_ptr, internal_tensor.shape(), dst_dims, output_ptr); @@ -147,7 +150,7 @@ class OpenCLBufferTransformer { std::string TransformedFilterName(const std::string &name); -template +template MaceStatus TransformFilter( mace::OpConstructContext *context, OperatorDef *op_def, diff --git a/mace/ops/opencl/image/reduce.h b/mace/ops/opencl/image/reduce.h index 7a4bf2b55934a1880447c6b6c1b5a3be87915ac4..a2bdc65280fd82cdd244c0c949e2753765a3bf6d 100644 --- a/mace/ops/opencl/image/reduce.h +++ b/mace/ops/opencl/image/reduce.h @@ -35,7 +35,7 @@ template class ReduceKernel : public OpenCLReduceKernel { public: ReduceKernel(ReduceType type, - const std::vector axis, + const std::vector &axis, const bool keep_dims) : reduce_type_(type), axis_(axis), keep_dims_(keep_dims) {} diff --git a/mace/ops/opencl/image/winograd_conv2d.cc b/mace/ops/opencl/image/winograd_conv2d.cc index 27a0bc30533f4538a537dc6c3084178ee1d5d3cd..40b83fa62e757b1f13a1e06c6f91b6db1e29ab1b 100644 --- a/mace/ops/opencl/image/winograd_conv2d.cc +++ b/mace/ops/opencl/image/winograd_conv2d.cc @@ -241,7 +241,7 @@ extern MaceStatus WinogradConv2dK3x3S1(OpContext *context, bool input_changed = !IsVecEqual(*prev_input_shape, input->shape()); *prev_input_shape = input->shape(); - auto output_shape = output->shape(); + auto &output_shape = output->shape(); const index_t round_h = (output_shape[1] + wino_blk_size - 1) / wino_blk_size; const index_t round_w = diff --git a/mace/ops/ops_test_util.cc b/mace/ops/ops_test_util.cc index c1569204bdc11895ff47392838e9987bdf2ef75b..ab61e8c627fd72d4cb8c2c279f9567e92692df23 100644 --- a/mace/ops/ops_test_util.cc +++ b/mace/ops/ops_test_util.cc @@ -101,55 +101,50 @@ void OpDefBuilder::Finalize(OperatorDef *op_def) const { } namespace { -#ifdef MACE_ENABLE_OPENCL std::string GetStoragePathFromEnv() { char *storage_path_str = getenv("MACE_INTERNAL_STORAGE_PATH"); if (storage_path_str == nullptr) return ""; return storage_path_str; } -#endif } // namespace OpTestContext *OpTestContext::Get(int num_threads, - CPUAffinityPolicy cpu_affinity_policy, - bool use_gemmlowp) { + CPUAffinityPolicy cpu_affinity_policy) { static OpTestContext instance(num_threads, - cpu_affinity_policy, - use_gemmlowp); + cpu_affinity_policy); 
return &instance; } OpTestContext::OpTestContext(int num_threads, - CPUAffinityPolicy cpu_affinity_policy, - -#ifdef MACE_ENABLE_OPENCL - bool use_gemmlowp) + CPUAffinityPolicy cpu_affinity_policy) : gpu_context_(std::make_shared(GetStoragePathFromEnv())), - opencl_mem_types_({MemoryType::GPU_IMAGE}) { -#else - bool use_gemmlowp) { -#endif + opencl_mem_types_({MemoryType::GPU_IMAGE}), + thread_pool_(make_unique(num_threads, + cpu_affinity_policy)) { + thread_pool_->Init(); + device_map_[DeviceType::CPU] = make_unique( - num_threads, cpu_affinity_policy, use_gemmlowp); + num_threads, cpu_affinity_policy, thread_pool_.get()); -#ifdef MACE_ENABLE_OPENCL device_map_[DeviceType::GPU] = make_unique( gpu_context_->opencl_tuner(), gpu_context_->opencl_cache_storage(), GPUPriorityHint::PRIORITY_NORMAL, - GPUPerfHint::PERF_HIGH); -#endif // MACE_ENABLE_OPENCL -} - -Device *OpTestContext::GetDevice(DeviceType device_type) { - return device_map_[device_type].get(); + GPUPerfHint::PERF_HIGH, + nullptr, + num_threads, + cpu_affinity_policy, + thread_pool_.get()); } -#ifdef MACE_ENABLE_OPENCL std::shared_ptr OpTestContext::gpu_context() const { return gpu_context_; } +Device *OpTestContext::GetDevice(DeviceType device_type) { + return device_map_[device_type].get(); +} + std::vector OpTestContext::opencl_mem_types() { return opencl_mem_types_; } @@ -165,7 +160,6 @@ void OpTestContext::SetOCLImageTestFlag() { void OpTestContext::SetOCLImageAndBufferTestFlag() { opencl_mem_types_ = {MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER}; } -#endif // MACE_ENABLE_OPENCL bool OpsTestNet::Setup(mace::DeviceType device) { NetDef net_def; @@ -237,7 +231,6 @@ MaceStatus OpsTestNet::Run() { MaceStatus OpsTestNet::RunOp(mace::DeviceType device) { if (device == DeviceType::GPU) { -#ifdef MACE_ENABLE_OPENCL auto opencl_mem_types = OpTestContext::Get()->opencl_mem_types(); for (auto type : opencl_mem_types) { OpTestContext::Get()->GetDevice(device) @@ -246,9 +239,6 @@ MaceStatus OpsTestNet::RunOp(mace::DeviceType device) { MACE_RETURN_IF_ERROR(Run()); } return MaceStatus::MACE_SUCCESS; -#else - return MaceStatus::MACE_UNSUPPORTED; -#endif } else { Setup(device); return Run(); diff --git a/mace/ops/ops_test_util.h b/mace/ops/ops_test_util.h index 871803234236de5c3833468dfa785dd339e3ee16..e9ef4d90f89807f8b123b5e3cba75c075ab52657 100644 --- a/mace/ops/ops_test_util.h +++ b/mace/ops/ops_test_util.h @@ -26,22 +26,20 @@ #include #include "gtest/gtest.h" +#include "mace/core/types.h" #include "mace/core/net.h" #include "mace/core/device_context.h" +#include "mace/core/runtime/opencl/gpu_device.h" +#include "mace/core/runtime/opencl/opencl_util.h" #include "mace/core/tensor.h" #include "mace/core/workspace.h" #include "mace/ops/ops_registry.h" #include "mace/public/mace.h" #include "mace/utils/memory.h" #include "mace/utils/math.h" -#include "mace/utils/quantize.h" +#include "mace/core/quantize.h" #include "mace/ops/testing/test_utils.h" -#ifdef MACE_ENABLE_OPENCL -#include "mace/core/runtime/opencl/gpu_device.h" -#include "mace/core/runtime/opencl/opencl_util.h" -#endif - namespace mace { namespace ops { namespace test { @@ -79,30 +77,26 @@ class OpTestContext { public: static OpTestContext *Get( int num_threads = -1, - CPUAffinityPolicy cpu_affinity_policy = AFFINITY_BIG_ONLY, - bool use_gemmlowp = true); - Device *GetDevice(DeviceType device_type); - -#ifdef MACE_ENABLE_OPENCL + CPUAffinityPolicy cpu_affinity_policy = AFFINITY_BIG_ONLY); std::shared_ptr gpu_context() const; + Device *GetDevice(DeviceType device_type); std::vector 
opencl_mem_types(); void SetOCLBufferTestFlag(); void SetOCLImageTestFlag(); void SetOCLImageAndBufferTestFlag(); -#endif + utils::ThreadPool *thread_pool() { + return thread_pool_.get(); + } private: OpTestContext(int num_threads, - CPUAffinityPolicy cpu_affinity_policy, - bool use_gemmlowp); + CPUAffinityPolicy cpu_affinity_policy); MACE_DISABLE_COPY_AND_ASSIGN(OpTestContext); - std::map> device_map_; - -#ifdef MACE_ENABLE_OPENCL std::shared_ptr gpu_context_; std::vector opencl_mem_types_; -#endif + std::map> device_map_; + std::unique_ptr thread_pool_; }; class OpsTestNet { @@ -430,9 +424,7 @@ class OpsTestBase : public ::testing::Test { } virtual void TearDown() { -#ifdef MACE_ENABLE_OPENCL OpTestContext::Get()->SetOCLImageTestFlag(); -#endif } }; diff --git a/mace/ops/pad.cc b/mace/ops/pad.cc index aaa6b230f4b5237dc88d16e369dcf289a8fe9df6..e0a94f4a7f5b2f6a00eddd816b3b92ae9da816d1 100644 --- a/mace/ops/pad.cc +++ b/mace/ops/pad.cc @@ -84,7 +84,6 @@ class PadOp : public Operation { if (type_ == PadType::CONSTANT) { std::fill(output_ptr, output_ptr + output->size(), this->constant_value_); -#pragma omp parallel for collapse(3) for (index_t b = 0; b < batch; ++b) { for (index_t c = 0; c < channel; ++c) { for (index_t h = 0; h < height; ++h) { @@ -109,7 +108,6 @@ class PadOp : public Operation { const int l_add = type_ == PadType::REFLECT ? 0 : -1; const int r_add = type_ == PadType::REFLECT ? -2 : -1; -#pragma omp parallel for collapse(1) for (index_t h = 0; h < o_height; ++h) { index_t h_in = get_src_idx(h, height, paddings_[4], l_add, r_add); diff --git a/mace/ops/pad_context.cc b/mace/ops/pad_context.cc index 6c463ec9830b2e22e234cef6e4ec7eddc61d9906..8370f9f56d03056b6e9c905771abfbcadbf2c1b9 100644 --- a/mace/ops/pad_context.cc +++ b/mace/ops/pad_context.cc @@ -63,7 +63,6 @@ class PadContextOp : public Operation { for (index_t i = 0; i < batch; ++i) { T *out_base = output_data + i * output_chunk * dim; const T *in_base = input_data + i * chunk * dim; -#pragma omp parallel for schedule(runtime) for (index_t j = 0; j < left_context_; ++j) { memcpy(out_base + j * dim, in_base, dim * sizeof(T)); } @@ -71,7 +70,6 @@ class PadContextOp : public Operation { memcpy(out_base, in_base, chunk * dim * sizeof(T)); out_base = out_base + chunk * dim; in_base = in_base + (chunk -1) * dim; -#pragma omp parallel for schedule(runtime) for (index_t j = 0; j < right_context_; ++j) { memcpy(out_base + j * dim, in_base, dim * sizeof(T)); } diff --git a/mace/ops/pnorm.cc b/mace/ops/pnorm.cc index 6964c6810bac50e59350410f009ac85c85f44ed6..1d0d6698604834fdd58fb390171d21d0976780ec 100644 --- a/mace/ops/pnorm.cc +++ b/mace/ops/pnorm.cc @@ -28,14 +28,13 @@ #include "mace/core/operator.h" - namespace mace { namespace ops { -template +template class PNormOp; -template +template class PNormOp : public Operation { public: explicit PNormOp(OpConstructContext *context) @@ -52,7 +51,7 @@ class PNormOp : public Operation { const index_t dim_size = input_shape.size(); MACE_CHECK(dim_size >= 1, "PNorm only supports input dim size >= 1"); std::vector output_shape(input_shape); - const index_t input_dim = input_shape[dim_size -1]; + const index_t input_dim = input_shape[dim_size - 1]; MACE_CHECK(output_dim_ > 0, "Output dim should be greater than zero."); MACE_CHECK(input_dim % output_dim_ == 0 && output_dim_ < input_dim, @@ -69,48 +68,59 @@ class PNormOp : public Operation { const index_t bh = std::accumulate(input->shape().begin(), input->shape().end() - 1, 1, std::multiplies()); + + utils::ThreadPool + &thread_pool = 
context->device()->cpu_runtime()->thread_pool(); + if (p_ == 0) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t i = 0; i < bh; ++i) { - for (index_t j = 0; j < output_dim_; ++j) { - const T *in_base = input_data + i * input_dim + j * group_size; - T *out_base = output_data + i * output_dim_; - T temp_result = 0; - for (index_t g = 0; g < group_size; ++g) { - T value = - (std::fabs(in_base[g]) - > std::numeric_limits::epsilon()) ? 1.0f : 0.0f; - temp_result += value; + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t i = start0; i < end0; i += step0) { + for (index_t j = start1; j < end1; j += step1) { + const T *in_base = input_data + i * input_dim + j * group_size; + T *out_base = output_data + i * output_dim_; + T temp_result = 0; + for (index_t g = 0; g < group_size; ++g) { + T value = + (std::fabs(in_base[g]) + > std::numeric_limits::epsilon()) ? 1.0f : 0.0f; + temp_result += value; + } + out_base[j] = temp_result; } - out_base[j] = temp_result; } - } + }, 0, bh, 1, 0, output_dim_, 1); + } else if (p_ == 1) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t i = 0; i < bh; ++i) { - for (index_t j = 0; j < output_dim_; ++j) { - const T *in_base = input_data + i * input_dim + j * group_size; - T *out_base = output_data + i * output_dim_; - T temp_result = 0; - for (index_t g = 0; g < group_size; ++g) { - temp_result += std::abs(in_base[g]);; + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t i = start0; i < end0; i += step0) { + for (index_t j = start1; j < end1; j += step1) { + const T *in_base = input_data + i * input_dim + j * group_size; + T *out_base = output_data + i * output_dim_; + T temp_result = 0; + for (index_t g = 0; g < group_size; ++g) { + temp_result += std::abs(in_base[g]);; + } + out_base[j] = temp_result; } - out_base[j] = temp_result; } - } + }, 0, bh, 1, 0, output_dim_, 1); } else if (p_ == 2) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t i = 0; i < bh; ++i) { - for (index_t j = 0; j < output_dim_; ++j) { - const T *in_base = input_data + i * input_dim + j * group_size; - T *out_base = output_data + i * output_dim_; - T temp_result = 0; - for (index_t g = 0; g < group_size; ++g) { - temp_result += in_base[g] * in_base[g]; + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t i = start0; i < end0; i += step0) { + for (index_t j = start1; j < end1; j += step1) { + const T *in_base = input_data + i * input_dim + j * group_size; + T *out_base = output_data + i * output_dim_; + T temp_result = 0; + for (index_t g = 0; g < group_size; ++g) { + temp_result += in_base[g] * in_base[g]; + } + out_base[j] = std::sqrt(temp_result); } - out_base[j] = std::sqrt(temp_result); } - } + }, 0, bh, 1, 0, output_dim_, 1); } else { LOG(FATAL) << "PNorm's p should be 0, 1 or 2, here p is: " << p_; } diff --git a/mace/ops/pooling.cc b/mace/ops/pooling.cc index 969f2774e3bb5a5fcf35e37e5f613f2f87b9f19b..52842c5230a299ade8af2d85e24ba23f00052e30 100644 --- a/mace/ops/pooling.cc +++ b/mace/ops/pooling.cc @@ -57,10 +57,10 @@ class PoolingOpBase : public ConvPool2dOpBase { MACE_OP_OUTPUT_TAGS(OUTPUT); }; -template +template class PoolingOp; -template <> +template<> class PoolingOp : public PoolingOpBase { public: explicit PoolingOp(OpConstructContext *context) @@ -99,7 +99,8 
@@ class PoolingOp : public PoolingOpBase { int pad_hw[2] = {paddings[0] / 2, paddings[1] / 2}; if (pooling_type_ == PoolingType::MAX) { - MaxPooling(input, + MaxPooling(context, + input, input_shape, output_shape.data(), kernels_.data(), @@ -108,7 +109,8 @@ class PoolingOp : public PoolingOpBase { pad_hw, output); } else if (pooling_type_ == PoolingType::AVG) { - AvgPooling(input, + AvgPooling(context, + input, input_shape, output_shape.data(), kernels_.data(), @@ -124,7 +126,8 @@ class PoolingOp : public PoolingOpBase { } private: - void MaxPooling(const float *input, + void MaxPooling(const OpContext *context, + const float *input, const index_t *in_shape, const index_t *out_shape, const int *filter_hw, @@ -132,45 +135,56 @@ class PoolingOp : public PoolingOpBase { const int *dilation_hw, const int *pad_hw, float *output) { - const index_t in_image_size = in_shape[2] * in_shape[3]; - const index_t out_image_size = out_shape[2] * out_shape[3]; - const index_t in_batch_size = in_shape[1] * in_image_size; - const index_t out_batch_size = out_shape[1] * out_image_size; - -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t c = 0; c < out_shape[1]; ++c) { - const index_t out_base = b * out_batch_size + c * out_image_size; - const index_t in_base = b * in_batch_size + c * in_image_size; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - const index_t in_height = in_shape[2]; - const index_t in_width = in_shape[3]; - - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w < out_width; ++w) { - const index_t out_offset = out_base + h * out_width + w; - float res = std::numeric_limits::lowest(); - for (int fh = 0; fh < filter_hw[0]; ++fh) { - for (int fw = 0; fw < filter_hw[1]; ++fw) { - index_t inh = - h * stride_hw[0] + dilation_hw[0] * fh - pad_hw[0]; - index_t inw = - w * stride_hw[1] + dilation_hw[1] * fw - pad_hw[1]; - if (inh >= 0 && inh < in_height && inw >= 0 && inw < in_width) { - index_t input_offset = in_base + inh * in_width + inw; - res = std::max(res, input[input_offset]); + const index_t batch = out_shape[0]; + const index_t out_channels = out_shape[1]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t in_channels = in_shape[1]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + + const index_t in_image_size = in_height * in_width; + const index_t out_image_size = out_height * out_width; + const index_t in_batch_size = in_channels * in_image_size; + const index_t out_batch_size = out_channels * out_image_size; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { + const index_t out_base = b * out_batch_size + c * out_image_size; + const index_t in_base = b * in_batch_size + c * in_image_size; + + for (index_t h = 0; h < out_height; ++h) { + for (index_t w = 0; w < out_width; ++w) { + const index_t out_offset = out_base + h * out_width + w; + float res = std::numeric_limits::lowest(); + for (int fh = 0; fh < filter_hw[0]; ++fh) { + for (int fw = 0; fw < filter_hw[1]; ++fw) { + index_t inh = + h * stride_hw[0] + dilation_hw[0] * fh - pad_hw[0]; + index_t inw = + w * stride_hw[1] + dilation_hw[1] * fw - pad_hw[1]; + if (inh >= 0 && 
inh < in_height && inw >= 0 + && inw < in_width) { + index_t input_offset = in_base + inh * in_width + inw; + res = std::max(res, input[input_offset]); + } } } + output[out_offset] = res; } - output[out_offset] = res; } } } - } + }, 0, batch, 1, 0, out_channels, 1); } - void AvgPooling(const float *input, + void AvgPooling(const OpContext *context, + const float *input, const index_t *in_shape, const index_t *out_shape, const int *filter_hw, @@ -178,48 +192,62 @@ class PoolingOp : public PoolingOpBase { const int *dilation_hw, const int *pad_hw, float *output) { - const index_t in_image_size = in_shape[2] * in_shape[3]; - const index_t out_image_size = out_shape[2] * out_shape[3]; - const index_t in_batch_size = in_shape[1] * in_image_size; - const index_t out_batch_size = out_shape[1] * out_image_size; - -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t c = 0; c < out_shape[1]; ++c) { - const index_t out_base = b * out_batch_size + c * out_image_size; - const index_t in_base = b * in_batch_size + c * in_image_size; - const index_t in_height = in_shape[2]; - const index_t in_width = in_shape[3]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w < out_width; ++w) { - const index_t out_offset = out_base + h * out_width + w; - float res = 0; - int block_size = 0; - for (int fh = 0; fh < filter_hw[0]; ++fh) { - for (int fw = 0; fw < filter_hw[1]; ++fw) { - index_t inh = - h * stride_hw[0] + dilation_hw[0] * fh - pad_hw[0]; - index_t inw = - w * stride_hw[1] + dilation_hw[1] * fw - pad_hw[1]; - if (inh >= 0 && inh < in_height && inw >= 0 && inw < in_width) { - index_t input_offset = in_base + inh * in_width + inw; - res += input[input_offset]; - ++block_size; + const index_t batch = out_shape[0]; + const index_t out_channels = out_shape[1]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t in_channels = in_shape[1]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + + const index_t in_image_size = in_height * in_width; + const index_t out_image_size = out_height * out_width; + const index_t in_batch_size = in_channels * in_image_size; + const index_t out_batch_size = out_channels * out_image_size; + + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { + const index_t out_base = b * out_batch_size + c * out_image_size; + const index_t in_base = b * in_batch_size + c * in_image_size; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + for (index_t h = 0; h < out_height; ++h) { + for (index_t w = 0; w < out_width; ++w) { + const index_t out_offset = out_base + h * out_width + w; + float res = 0; + int block_size = 0; + for (int fh = 0; fh < filter_hw[0]; ++fh) { + for (int fw = 0; fw < filter_hw[1]; ++fw) { + index_t inh = + h * stride_hw[0] + dilation_hw[0] * fh - pad_hw[0]; + index_t inw = + w * stride_hw[1] + dilation_hw[1] * fw - pad_hw[1]; + if (inh >= 0 && inh < in_height && inw >= 0 + && inw < in_width) { + index_t input_offset = in_base + inh * in_width + inw; + res += 
input[input_offset]; + ++block_size; + } } } + output[out_offset] = res / block_size; } - output[out_offset] = res / block_size; } } } - } + }, 0, batch, 1, 0, out_channels, 1); } }; #ifdef MACE_ENABLE_QUANTIZE -template <> +template<> class PoolingOp : public PoolingOpBase { public: explicit PoolingOp(OpConstructContext *context) @@ -275,7 +303,8 @@ class PoolingOp : public PoolingOpBase { int pad_hw[2] = {paddings[0] / 2, paddings[1] / 2}; if (pooling_type_ == PoolingType::MAX) { - MaxPooling(input, + MaxPooling(context, + input, input_tensor->shape().data(), output_shape.data(), kernels_.data(), @@ -283,7 +312,8 @@ class PoolingOp : public PoolingOpBase { pad_hw, output); } else if (pooling_type_ == PoolingType::AVG) { - AvgPooling(input, + AvgPooling(context, + input, input_tensor->shape().data(), output_shape.data(), kernels_.data(), @@ -298,131 +328,145 @@ class PoolingOp : public PoolingOpBase { } private: - void MaxPooling(const uint8_t *input, + void MaxPooling(const OpContext *context, + const uint8_t *input, const index_t *in_shape, const index_t *out_shape, const int *filter_hw, const int *stride_hw, const int *pad_hw, uint8_t *output) { -#pragma omp parallel for collapse(3) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t h = 0; h < out_shape[1]; ++h) { - for (index_t w = 0; w < out_shape[2]; ++w) { - const index_t out_height = out_shape[1]; - const index_t out_width = out_shape[2]; - const index_t channels = out_shape[3]; - const index_t in_height = in_shape[1]; - const index_t in_width = in_shape[2]; - const index_t in_h_base = h * stride_hw[0] - pad_hw[0]; - const index_t in_w_base = w * stride_hw[1] - pad_hw[1]; - const index_t in_h_begin = std::max(0, in_h_base); - const index_t in_w_begin = std::max(0, in_w_base); - const index_t in_h_end = - std::min(in_height, in_h_base + filter_hw[0]); - const index_t in_w_end = - std::min(in_width, in_w_base + filter_hw[1]); - - uint8_t *out_ptr = - output + ((b * out_height + h) * out_width + w) * channels; - std::fill_n(out_ptr, channels, 0); - for (index_t ih = in_h_begin; ih < in_h_end; ++ih) { - for (index_t iw = in_w_begin; iw < in_w_end; ++iw) { - const uint8_t *in_ptr = input + - ((b * in_height + ih) * in_width + iw) * channels; - index_t c = 0; + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute3D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1, + index_t start2, index_t end2, index_t step2) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t h = start1; h < end1; h += step1) { + for (index_t w = start2; w < end2; w += step2) { + const index_t out_height = out_shape[1]; + const index_t out_width = out_shape[2]; + const index_t channels = out_shape[3]; + const index_t in_height = in_shape[1]; + const index_t in_width = in_shape[2]; + const index_t in_h_base = h * stride_hw[0] - pad_hw[0]; + const index_t in_w_base = w * stride_hw[1] - pad_hw[1]; + const index_t in_h_begin = std::max(0, in_h_base); + const index_t in_w_begin = std::max(0, in_w_base); + const index_t in_h_end = + std::min(in_height, in_h_base + filter_hw[0]); + const index_t in_w_end = + std::min(in_width, in_w_base + filter_hw[1]); + + uint8_t *out_ptr = + output + ((b * out_height + h) * out_width + w) * channels; + std::fill_n(out_ptr, channels, 0); + for (index_t ih = in_h_begin; ih < in_h_end; ++ih) { + for (index_t iw = in_w_begin; iw < in_w_end; ++iw) { + const uint8_t *in_ptr = input + + ((b * in_height 
+ ih) * in_width + iw) * channels; + index_t c = 0; #if defined(MACE_ENABLE_NEON) - for (; c <= channels - 16; c += 16) { - uint8x16_t out_vec = vld1q_u8(out_ptr + c); - uint8x16_t in_vec = vld1q_u8(in_ptr + c); - out_vec = vmaxq_u8(out_vec, in_vec); - vst1q_u8(out_ptr + c, out_vec); - } - for (; c <= channels - 8; c += 8) { - uint8x8_t out_vec = vld1_u8(out_ptr + c); - uint8x8_t in_vec = vld1_u8(in_ptr + c); - out_vec = vmax_u8(out_vec, in_vec); - vst1_u8(out_ptr + c, out_vec); - } + for (; c <= channels - 16; c += 16) { + uint8x16_t out_vec = vld1q_u8(out_ptr + c); + uint8x16_t in_vec = vld1q_u8(in_ptr + c); + out_vec = vmaxq_u8(out_vec, in_vec); + vst1q_u8(out_ptr + c, out_vec); + } + for (; c <= channels - 8; c += 8) { + uint8x8_t out_vec = vld1_u8(out_ptr + c); + uint8x8_t in_vec = vld1_u8(in_ptr + c); + out_vec = vmax_u8(out_vec, in_vec); + vst1_u8(out_ptr + c, out_vec); + } #endif - for (; c < channels; ++c) { - out_ptr[c] = std::max(out_ptr[c], in_ptr[c]); + for (; c < channels; ++c) { + out_ptr[c] = std::max(out_ptr[c], in_ptr[c]); + } } } } } } - } + }, 0, out_shape[0], 1, 0, out_shape[1], 1, 0, out_shape[2], 1); } - void AvgPooling(const uint8_t *input, + void AvgPooling(const OpContext *context, + const uint8_t *input, const index_t *in_shape, const index_t *out_shape, const int *filter_hw, const int *stride_hw, const int *pad_hw, uint8_t *output) { -#pragma omp parallel for collapse(3) schedule(runtime) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t h = 0; h < out_shape[1]; ++h) { - for (index_t w = 0; w < out_shape[2]; ++w) { - const index_t out_height = out_shape[1]; - const index_t out_width = out_shape[2]; - const index_t channels = out_shape[3]; - const index_t in_height = in_shape[1]; - const index_t in_width = in_shape[2]; - const index_t in_h_base = h * stride_hw[0] - pad_hw[0]; - const index_t in_w_base = w * stride_hw[1] - pad_hw[1]; - const index_t in_h_begin = std::max(0, in_h_base); - const index_t in_w_begin = std::max(0, in_w_base); - const index_t in_h_end = - std::min(in_height, in_h_base + filter_hw[0]); - const index_t in_w_end = - std::min(in_width, in_w_base + filter_hw[1]); - const index_t block_size = - (in_h_end - in_h_begin) * (in_w_end - in_w_begin); - MACE_CHECK(block_size > 0); - - std::vector average_buffer(channels); - uint16_t *avg_buffer = average_buffer.data(); - std::fill_n(avg_buffer, channels, 0); - for (index_t ih = in_h_begin; ih < in_h_end; ++ih) { - for (index_t iw = in_w_begin; iw < in_w_end; ++iw) { - const uint8_t *in_ptr = input + - ((b * in_height + ih) * in_width + iw) * channels; - index_t c = 0; + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute3D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1, + index_t start2, index_t end2, index_t step2) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t h = start1; h < end1; h += step1) { + for (index_t w = start2; w < end2; w += step2) { + const index_t out_height = out_shape[1]; + const index_t out_width = out_shape[2]; + const index_t channels = out_shape[3]; + const index_t in_height = in_shape[1]; + const index_t in_width = in_shape[2]; + const index_t in_h_base = h * stride_hw[0] - pad_hw[0]; + const index_t in_w_base = w * stride_hw[1] - pad_hw[1]; + const index_t in_h_begin = std::max(0, in_h_base); + const index_t in_w_begin = std::max(0, in_w_base); + const index_t in_h_end = + std::min(in_height, in_h_base + filter_hw[0]); + const index_t 
in_w_end = + std::min(in_width, in_w_base + filter_hw[1]); + const index_t block_size = + (in_h_end - in_h_begin) * (in_w_end - in_w_begin); + MACE_CHECK(block_size > 0); + + std::vector average_buffer(channels); + uint16_t *avg_buffer = average_buffer.data(); + std::fill_n(avg_buffer, channels, 0); + for (index_t ih = in_h_begin; ih < in_h_end; ++ih) { + for (index_t iw = in_w_begin; iw < in_w_end; ++iw) { + const uint8_t *in_ptr = input + + ((b * in_height + ih) * in_width + iw) * channels; + index_t c = 0; #if defined(MACE_ENABLE_NEON) - for (; c <= channels - 16; c += 16) { - uint16x8_t avg_vec[2]; - avg_vec[0] = vld1q_u16(avg_buffer + c); - avg_vec[1] = vld1q_u16(avg_buffer + c + 8); - uint8x16_t in_vec = vld1q_u8(in_ptr + c); - avg_vec[0] = vaddw_u8(avg_vec[0], vget_low_u8(in_vec)); - avg_vec[1] = vaddw_u8(avg_vec[1], vget_high_u8(in_vec)); - vst1q_u16(avg_buffer + c, avg_vec[0]); - vst1q_u16(avg_buffer + c + 8, avg_vec[1]); - } - for (; c <= channels - 8; c += 8) { - uint16x8_t avg_vec = vld1q_u16(avg_buffer + c); - uint8x8_t in_vec = vld1_u8(in_ptr + c); - avg_vec = vaddw_u8(avg_vec, in_vec); - vst1q_u16(avg_buffer + c, avg_vec); - } + for (; c <= channels - 16; c += 16) { + uint16x8_t avg_vec[2]; + avg_vec[0] = vld1q_u16(avg_buffer + c); + avg_vec[1] = vld1q_u16(avg_buffer + c + 8); + uint8x16_t in_vec = vld1q_u8(in_ptr + c); + avg_vec[0] = vaddw_u8(avg_vec[0], vget_low_u8(in_vec)); + avg_vec[1] = vaddw_u8(avg_vec[1], vget_high_u8(in_vec)); + vst1q_u16(avg_buffer + c, avg_vec[0]); + vst1q_u16(avg_buffer + c + 8, avg_vec[1]); + } + for (; c <= channels - 8; c += 8) { + uint16x8_t avg_vec = vld1q_u16(avg_buffer + c); + uint8x8_t in_vec = vld1_u8(in_ptr + c); + avg_vec = vaddw_u8(avg_vec, in_vec); + vst1q_u16(avg_buffer + c, avg_vec); + } #endif - for (; c < channels; ++c) { - avg_buffer[c] += in_ptr[c]; + for (; c < channels; ++c) { + avg_buffer[c] += in_ptr[c]; + } } } - } - uint8_t *out_ptr = - output + ((b * out_height + h) * out_width + w) * channels; - for (index_t c = 0; c < channels; ++c) { - out_ptr[c] = static_cast( - (avg_buffer[c] + block_size / 2) / block_size); + uint8_t *out_ptr = + output + ((b * out_height + h) * out_width + w) * channels; + for (index_t c = 0; c < channels; ++c) { + out_ptr[c] = static_cast( + (avg_buffer[c] + block_size / 2) / block_size); + } } } } - } + }, 0, out_shape[0], 1, 0, out_shape[1], 1, 0, out_shape[2], 1); } }; #endif // MACE_ENABLE_QUANTIZE @@ -454,7 +498,6 @@ class PoolingOp : public PoolingOpBase { }; #endif // MACE_ENABLE_OPENCL - void RegisterPooling(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "Pooling", PoolingOp, DeviceType::CPU, float); diff --git a/mace/ops/pooling_test.cc b/mace/ops/pooling_test.cc index f9a83a23e41ef290fde9d8005bcf8419a2b217ea..104b67bc304de59a16d54bcdc6c66c68c987c0c7 100644 --- a/mace/ops/pooling_test.cc +++ b/mace/ops/pooling_test.cc @@ -216,7 +216,7 @@ TEST_F(PoolingOpTest, OPENCLSimpleMaxPooling3S2) { SimpleMaxPooling3S2(); } namespace { template void MaxPooling3S2(const std::vector &input_shape, - const std::vector strides, + const std::vector &strides, Padding padding) { // Construct graph OpsTestNet net; diff --git a/mace/ops/prior_box.cc b/mace/ops/prior_box.cc index 3226d2be63f0380feac80f9ddd52cb7172da928f..62040d272d4eb7ba46ba8b6d3bc20db401f9c644 100644 --- a/mace/ops/prior_box.cc +++ b/mace/ops/prior_box.cc @@ -113,7 +113,6 @@ class PriorBoxOp : public Operation { } if (clip_) { -#pragma omp parallel for schedule(runtime) for (int i = 0; i < dim; ++i) { T min = 0; T max = 1; @@ 
-122,7 +121,6 @@ class PriorBoxOp : public Operation { } output_data += dim; -#pragma omp parallel for schedule(runtime) for (int i = 0; i < dim / 4; ++i) { int index = i * 4; output_data[0 + index] = variance_[0]; diff --git a/mace/ops/quantize.cc b/mace/ops/quantize.cc index 6be719c5feb4e8ae8af3f1ad1734e9843961b8df..09354a45a5513783d9962adbbe1ea25f27b33529 100644 --- a/mace/ops/quantize.cc +++ b/mace/ops/quantize.cc @@ -19,15 +19,15 @@ #include "mace/core/operator.h" #include "mace/core/tensor.h" -#include "mace/utils/quantize.h" +#include "mace/core/quantize.h" namespace mace { namespace ops { -template +template class QuantizeOp; -template <> +template<> class QuantizeOp : public Operation { public: explicit QuantizeOp(OpConstructContext *context) @@ -36,7 +36,8 @@ class QuantizeOp : public Operation { static_cast(Operation::GetOptionalArg("non_zero", 0))), find_range_every_time_(static_cast(Operation::GetOptionalArg( "find_range_every_time", - 0))) {} + 0))), + quantize_util_(&context->device()->cpu_runtime()->thread_pool()) {} MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); @@ -48,20 +49,20 @@ class QuantizeOp : public Operation { const float *input_data = input->data(); uint8_t *output_data = output->mutable_data(); if (!find_range_every_time_ && output->scale() > 0.f) { - QuantizeWithScaleAndZeropoint(input_data, - input->size(), - output->scale(), - output->zero_point(), - output_data); + quantize_util_.QuantizeWithScaleAndZeropoint(input_data, + input->size(), + output->scale(), + output->zero_point(), + output_data); } else { float scale; int32_t zero_point; - Quantize(input_data, - input->size(), - non_zero_, - output_data, - &scale, - &zero_point); + quantize_util_.Quantize(input_data, + input->size(), + non_zero_, + output_data, + &scale, + &zero_point); output->SetScale(scale); output->SetZeroPoint(zero_point); } @@ -71,16 +72,18 @@ class QuantizeOp : public Operation { private: bool non_zero_; bool find_range_every_time_; + QuantizeUtil quantize_util_; }; -template +template class DequantizeOp; -template +template class DequantizeOp : public Operation { public: explicit DequantizeOp(OpConstructContext *context) - : Operation(context) {} + : Operation(context), + quantize_util_(&context->device()->cpu_runtime()->thread_pool()) {} MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); @@ -91,13 +94,16 @@ class DequantizeOp : public Operation { Tensor::MappingGuard output_guard(output); const T *input_data = input->data(); float *output_data = output->mutable_data(); - Dequantize(input_data, - input->size(), - input->scale(), - input->zero_point(), - output_data); + quantize_util_.Dequantize(input_data, + input->size(), + input->scale(), + input->zero_point(), + output_data); return MaceStatus::MACE_SUCCESS; } + + private: + QuantizeUtil quantize_util_; }; void RegisterQuantize(OpRegistryBase *op_registry) { diff --git a/mace/ops/reduce.cc b/mace/ops/reduce.cc index 068212f204d85a3129d1f7ad9e9cbe0cfca06491..29ce821b84a98f8552ce4d3e60a0f9d693f39f0d 100644 --- a/mace/ops/reduce.cc +++ b/mace/ops/reduce.cc @@ -33,12 +33,12 @@ namespace ops { class ReduceOpBase : public Operation { public: explicit ReduceOpBase(OpConstructContext *context) - : Operation(context), - reduce_type_( - static_cast(Operation::GetOptionalArg( - "reduce_type", static_cast(MEAN)))), - axis_(Operation::GetRepeatedArgs("axis")), - keep_dims_(Operation::GetOptionalArg("keepdims", false)) { + : Operation(context), + reduce_type_( + static_cast(Operation::GetOptionalArg( + 
"reduce_type", static_cast(MEAN)))), + axis_(Operation::GetRepeatedArgs("axis")), + keep_dims_(Operation::GetOptionalArg("keepdims", false)) { } protected: @@ -54,15 +54,15 @@ class ReduceOpBase : public Operation { } protected: - ReduceType reduce_type_; + ReduceType reduce_type_; std::vector axis_; bool keep_dims_; }; -template +template class ReduceOp; -template +template class ReduceOp : public ReduceOpBase { public: explicit ReduceOp(OpConstructContext *context) @@ -78,7 +78,7 @@ class ReduceOp : public ReduceOpBase { output->SetScale(input->scale()); output->SetZeroPoint(input->zero_point()); output->Resize(out_shape_); - Compute(input, output); + Compute(context, input, output); return MaceStatus::MACE_SUCCESS; } @@ -92,8 +92,8 @@ class ReduceOp : public ReduceOpBase { } else { for (unsigned int i = 0; i < axis_.size(); ++i) { int index = axis_[i] >= 0 ? - axis_[i] : - axis_[i] + input->dim_size(); + axis_[i] : + axis_[i] + input->dim_size(); auto has_df = Operation::GetOptionalArg( "has_data_format", 0); if (has_df && DataTypeToEnum::value != DT_UINT8 @@ -128,7 +128,7 @@ class ReduceOp : public ReduceOpBase { if (n == 1) { bitmap[dim_index] = bitmap[dim_index - 1]; } - if (bitmap[dim_index-1] != bitmap[dim_index]) { + if (bitmap[dim_index - 1] != bitmap[dim_index]) { data_reshape_.push_back(n); } else { data_reshape_.back() *= n; @@ -137,7 +137,11 @@ class ReduceOp : public ReduceOpBase { } } - void Reduce1Dims(const T *input, ReduceType type, T *output) { + void Reduce1Dims(const OpContext *context, + const T *input, + ReduceType type, + T *output) { + MACE_UNUSED(context); if (reduce_first_axis_) { if (type == ReduceType::MEAN) { T tmp = 0; @@ -157,13 +161,13 @@ class ReduceOp : public ReduceOpBase { tmp = std::max(tmp, input[i]); } output[0] = tmp; - } else if (type == ReduceType::PROD) { + } else if (type == ReduceType::PROD) { T tmp = input[0]; for (int i = 1; i < data_reshape_[0]; ++i) { tmp = tmp * input[i]; } output[0] = tmp; - } else { + } else { MACE_NOT_IMPLEMENTED; } } else { @@ -171,359 +175,367 @@ class ReduceOp : public ReduceOpBase { } } - void Reduce2Dims(const T *input, ReduceType type, T *output) { + void Reduce2Dims(const OpContext *context, + const T *input, + ReduceType type, + T *output) { + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + if (reduce_first_axis_) { - if (type == ReduceType::MEAN) { -#pragma omp parallel for schedule(runtime) - for (int i = 0; i < data_reshape_[1]; ++i) { - T tmp = 0; - for (int j = 0; j < data_reshape_[0]; ++j) { - tmp += input[j * data_reshape_[1] + i]; + thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { + if (type == ReduceType::MEAN) { + for (index_t i = start; i < end; i += step) { + T tmp = 0; + for (int j = 0; j < data_reshape_[0]; ++j) { + tmp += input[j * data_reshape_[1] + i]; + } + output[i] = tmp / data_reshape_[0]; } - output[i] = tmp / data_reshape_[0]; - } - } else if (type == ReduceType::MIN) { -#pragma omp parallel for schedule(runtime) - for (int i = 0; i < data_reshape_[1]; ++i) { - T tmp = input[i]; - for (int j = 1; j < data_reshape_[0]; ++j) { - tmp = std::min(tmp, input[j * data_reshape_[1] + i]); + } else if (type == ReduceType::MIN) { + for (index_t i = start; i < end; i += step) { + T tmp = input[i]; + for (int j = 1; j < data_reshape_[0]; ++j) { + tmp = std::min(tmp, input[j * data_reshape_[1] + i]); + } + output[i] = tmp; } - output[i] = tmp; - } - } else if (type == ReduceType::MAX) { -#pragma omp parallel for schedule(runtime) - for (int i = 
0; i < data_reshape_[1]; ++i) { - T tmp = input[i]; - for (int j = 1; j < data_reshape_[0]; ++j) { - tmp = std::max(tmp, input[j * data_reshape_[1] + i]); + } else if (type == ReduceType::MAX) { + for (index_t i = start; i < end; i += step) { + T tmp = input[i]; + for (int j = 1; j < data_reshape_[0]; ++j) { + tmp = std::max(tmp, input[j * data_reshape_[1] + i]); + } + output[i] = tmp; } - output[i] = tmp; - } - } else if (type == ReduceType::PROD) { -#pragma omp parallel for schedule(runtime) - for (int i = 0; i < data_reshape_[1]; ++i) { - T tmp = input[i]; - for (int j = 1; j < data_reshape_[0]; ++j) { - tmp = tmp * input[j * data_reshape_[1] + i]; + } else if (type == ReduceType::PROD) { + for (index_t i = start; i < end; i += step) { + T tmp = input[i]; + for (int j = 1; j < data_reshape_[0]; ++j) { + tmp = tmp * input[j * data_reshape_[1] + i]; + } + output[i] = tmp; } - output[i] = tmp; + } else { + MACE_NOT_IMPLEMENTED; } - } else { - MACE_NOT_IMPLEMENTED; - } + }, 0, data_reshape_[1], 1); } else { - if (type == ReduceType::MEAN) { -#pragma omp parallel for schedule(runtime) - for (int i = 0; i < data_reshape_[0]; ++i) { - T tmp = 0; - for (int j = 0; j < data_reshape_[1]; ++j) { - tmp += input[i * data_reshape_[1] + j]; + thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { + if (type == ReduceType::MEAN) { + for (index_t i = start; i < end; i += step) { + T tmp = 0; + for (int j = 0; j < data_reshape_[1]; ++j) { + tmp += input[i * data_reshape_[1] + j]; + } + output[i] = tmp / data_reshape_[1]; } - output[i] = tmp / data_reshape_[1]; - } - } else if (type == ReduceType::MIN) { -#pragma omp parallel for schedule(runtime) - for (int i = 0; i < data_reshape_[0]; ++i) { - T tmp = input[i * data_reshape_[1]]; - for (int j = 1; j < data_reshape_[1]; ++j) { - tmp = std::min(tmp, input[i * data_reshape_[1] + j]); + } else if (type == ReduceType::MIN) { + for (index_t i = start; i < end; i += step) { + T tmp = input[i * data_reshape_[1]]; + for (int j = 1; j < data_reshape_[1]; ++j) { + tmp = std::min(tmp, input[i * data_reshape_[1] + j]); + } + output[i] = tmp; } - output[i] = tmp; - } - } else if (type == ReduceType::MAX) { -#pragma omp parallel for schedule(runtime) - for (int i = 0; i < data_reshape_[0]; ++i) { - T tmp = input[i * data_reshape_[1]]; - for (int j = 1; j < data_reshape_[1]; ++j) { - tmp = std::max(tmp, input[i * data_reshape_[1] + j]); + } else if (type == ReduceType::MAX) { + for (index_t i = start; i < end; i += step) { + T tmp = input[i * data_reshape_[1]]; + for (int j = 1; j < data_reshape_[1]; ++j) { + tmp = std::max(tmp, input[i * data_reshape_[1] + j]); + } + output[i] = tmp; } - output[i] = tmp; - } - } else if (type == ReduceType::PROD) { -#pragma omp parallel for schedule(runtime) - for (int i = 0; i < data_reshape_[0]; ++i) { - T tmp = input[i * data_reshape_[1]]; - for (int j = 1; j < data_reshape_[1]; ++j) { - tmp = tmp * input[i * data_reshape_[1] + j]; + } else if (type == ReduceType::PROD) { + for (index_t i = start; i < end; i += step) { + T tmp = input[i * data_reshape_[1]]; + for (int j = 1; j < data_reshape_[1]; ++j) { + tmp = tmp * input[i * data_reshape_[1] + j]; + } + output[i] = tmp; } - output[i] = tmp; + } else { + MACE_NOT_IMPLEMENTED; } - } else { - MACE_NOT_IMPLEMENTED; - } + }, 0, data_reshape_[0], 1); } } - void Reduce3Dims(const T *input, ReduceType type, T *output) { + void Reduce3Dims(const OpContext *context, + const T *input, + ReduceType type, + T *output) { + utils::ThreadPool + &thread_pool = 
context->device()->cpu_runtime()->thread_pool(); + if (reduce_first_axis_) { - if (type == ReduceType::MEAN) { -#pragma omp parallel for collapse(1) schedule(runtime) - for (int i = 0; i < data_reshape_[1]; ++i) { - for (int j = 0; j < data_reshape_[2]; ++j) { - for (int k = 0; k < data_reshape_[0]; ++k) { - output[i] += - input[(k * data_reshape_[1] + i) * data_reshape_[2] - + j]; + thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { + if (type == ReduceType::MEAN) { + for (index_t i = start; i < end; i += step) { + for (int j = 0; j < data_reshape_[2]; ++j) { + for (int k = 0; k < data_reshape_[0]; ++k) { + output[i] += + input[(k * data_reshape_[1] + i) * data_reshape_[2] + + j]; + } } + output[i] /= (data_reshape_[0] * data_reshape_[2]); } - output[i] /= (data_reshape_[0] * data_reshape_[2]); - } - } else if (type == ReduceType::MIN) { -#pragma omp parallel for collapse(1) schedule(runtime) - for (int i = 0; i < data_reshape_[1]; ++i) { - T tmp = input[i * data_reshape_[2]]; - for (int j = 0; j < data_reshape_[2]; ++j) { - for (int k = 0; k < data_reshape_[0]; ++k) { - tmp = std::min(tmp, - input[(k * data_reshape_[1] + i) * data_reshape_[2] - + j]); + } else if (type == ReduceType::MIN) { + for (index_t i = start; i < end; i += step) { + T tmp = input[i * data_reshape_[2]]; + for (int j = 0; j < data_reshape_[2]; ++j) { + for (int k = 0; k < data_reshape_[0]; ++k) { + tmp = std::min(tmp, + input[ + (k * data_reshape_[1] + i) * data_reshape_[2] + + j]); + } } + output[i] = tmp; } - output[i] = tmp; - } - } else if (type == ReduceType::MAX) { -#pragma omp parallel for collapse(1) schedule(runtime) - for (int i = 0; i < data_reshape_[1]; ++i) { - T tmp = input[i * data_reshape_[2]]; - for (int j = 0; j < data_reshape_[2]; ++j) { - for (int k = 0; k < data_reshape_[0]; ++k) { - tmp = - std::max(tmp, - input[(k * data_reshape_[1] + i) - * data_reshape_[2] + j]); + } else if (type == ReduceType::MAX) { + for (index_t i = start; i < end; i += step) { + T tmp = input[i * data_reshape_[2]]; + for (int j = 0; j < data_reshape_[2]; ++j) { + for (int k = 0; k < data_reshape_[0]; ++k) { + tmp = + std::max(tmp, + input[(k * data_reshape_[1] + i) + * data_reshape_[2] + j]); + } } + output[i] = tmp; } - output[i] = tmp; - } - } else if (type == ReduceType::PROD) { -#pragma omp parallel for schedule(runtime) - for (int i = 0; i < data_reshape_[1]; ++i) { - T tmp = 1; - for (int j = 0; j < data_reshape_[2]; ++j) { - for (int k = 0; k < data_reshape_[0]; ++k) { - tmp *= - input[(k * data_reshape_[1] + i) * data_reshape_[2] - + j]; + } else if (type == ReduceType::PROD) { + for (index_t i = start; i < end; i += step) { + T tmp = 1; + for (int j = 0; j < data_reshape_[2]; ++j) { + for (int k = 0; k < data_reshape_[0]; ++k) { + tmp *= + input[(k * data_reshape_[1] + i) * data_reshape_[2] + + j]; + } } + output[i] = tmp; } - output[i] = tmp; + } else { + MACE_NOT_IMPLEMENTED; } - } else { - MACE_NOT_IMPLEMENTED; - } + }, 0, data_reshape_[1], 1); } else { - if (type == ReduceType::MEAN) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (int i = 0; i < data_reshape_[0]; ++i) { - for (int j = 0; j < data_reshape_[2]; ++j) { - for (int k = 0; k < data_reshape_[1]; ++k) { - output[i * data_reshape_[2] + j] += - input[(i * data_reshape_[1] + k) * data_reshape_[2] - + j]; + thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { + if (type == ReduceType::MEAN) { + for (index_t i = start; i < end; i += step) { + for (int j = 0; j < data_reshape_[2]; ++j) { + for (int k 
= 0; k < data_reshape_[1]; ++k) { + output[i * data_reshape_[2] + j] += + input[(i * data_reshape_[1] + k) * data_reshape_[2] + + j]; + } + output[i * data_reshape_[2] + j] /= data_reshape_[1]; } - output[i * data_reshape_[2] + j] /= data_reshape_[1]; } - } - } else if (type == ReduceType::MIN) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (int i = 0; i < data_reshape_[0]; ++i) { - for (int j = 0; j < data_reshape_[2]; ++j) { - T tmp = input[i * data_reshape_[1] * data_reshape_[2] + j]; - for (int k = 1; k < data_reshape_[1]; ++k) { - tmp = std::min(tmp, - input[(i * data_reshape_[1] + k) * - data_reshape_[2] + j]); + } else if (type == ReduceType::MIN) { + for (index_t i = start; i < end; i += step) { + for (int j = 0; j < data_reshape_[2]; ++j) { + T tmp = input[i * data_reshape_[1] * data_reshape_[2] + j]; + for (int k = 1; k < data_reshape_[1]; ++k) { + tmp = std::min(tmp, + input[(i * data_reshape_[1] + k) * + data_reshape_[2] + j]); + } + output[i * data_reshape_[2] + j] = tmp; } - output[i * data_reshape_[2] + j] = tmp; } - } - } else if (type == ReduceType::MAX) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (int i = 0; i < data_reshape_[0]; ++i) { - for (int j = 0; j < data_reshape_[2]; ++j) { - T tmp = input[i * data_reshape_[1] * data_reshape_[2] + j]; - for (int k = 1; k < data_reshape_[1]; ++k) { - tmp = std::max(tmp, - input[(i * data_reshape_[1] + k) * - data_reshape_[2] + j]); + } else if (type == ReduceType::MAX) { + for (index_t i = start; i < end; i += step) { + for (int j = 0; j < data_reshape_[2]; ++j) { + T tmp = input[i * data_reshape_[1] * data_reshape_[2] + j]; + for (int k = 1; k < data_reshape_[1]; ++k) { + tmp = std::max(tmp, + input[(i * data_reshape_[1] + k) * + data_reshape_[2] + j]); + } + output[i * data_reshape_[2] + j] = tmp; } - output[i * data_reshape_[2] + j] = tmp; } - } - } else if (type == ReduceType::PROD) { -#pragma omp parallel for schedule(runtime) - for (int i = 0; i < data_reshape_[0]; ++i) { - for (int j = 0; j < data_reshape_[2]; ++j) { - T tmp = input[i * data_reshape_[1] * data_reshape_[2] + j]; - for (int k = 1; k < data_reshape_[1]; ++k) { - tmp *= input[(i * data_reshape_[1] + k) * - data_reshape_[2] + j]; + } else if (type == ReduceType::PROD) { + for (index_t i = start; i < end; i += step) { + for (int j = 0; j < data_reshape_[2]; ++j) { + T tmp = input[i * data_reshape_[1] * data_reshape_[2] + j]; + for (int k = 1; k < data_reshape_[1]; ++k) { + tmp *= input[(i * data_reshape_[1] + k) * + data_reshape_[2] + j]; + } + output[i * data_reshape_[2] + j] = tmp; } - output[i * data_reshape_[2] + j] = tmp; } + } else { + MACE_NOT_IMPLEMENTED; } - } else { - MACE_NOT_IMPLEMENTED; - } + }, 0, data_reshape_[0], 1); } } - void Reduce4Dims(const T *input, ReduceType type, T *output) { + void Reduce4Dims(const OpContext *context, + const T *input, + ReduceType type, + T *output) { + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + if (reduce_first_axis_) { - if (type == ReduceType::MEAN) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (int i = 0; i < data_reshape_[1]; ++i) { - for (int j = 0; j < data_reshape_[3]; ++j) { - for (int k = 0; k < data_reshape_[2]; ++k) { - for (int t = 0; t < data_reshape_[0]; ++t) { - output[i * data_reshape_[3] + j] += - input[((t * data_reshape_[1] + i) * - data_reshape_[2] + k)*data_reshape_[3] + j]; + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + if 
(type == ReduceType::MEAN) { + for (index_t i = start0; i < end0; i += step0) { + for (index_t j = start1; j < end1; j += step1) { + for (int k = 0; k < data_reshape_[2]; ++k) { + for (int t = 0; t < data_reshape_[0]; ++t) { + output[i * data_reshape_[3] + j] += + input[((t * data_reshape_[1] + i) * + data_reshape_[2] + k) * data_reshape_[3] + j]; + } } + output[i * data_reshape_[3] + j] /= + (data_reshape_[0] * data_reshape_[2]); } - output[i * data_reshape_[3] + j] /= - (data_reshape_[0] * data_reshape_[2]); } - } - } else if (type == ReduceType::MIN) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (int i = 0; i < data_reshape_[1]; ++i) { - for (int j = 0; j < data_reshape_[3]; ++j) { - T tmp = input[i * data_reshape_[2] * data_reshape_[3] + j]; - for (int k = 0; k < data_reshape_[2]; ++k) { - for (int t = 0; t < data_reshape_[0]; ++t) { - tmp = std::min(tmp, - input[((t * data_reshape_[1] + i) * - data_reshape_[2] + k)*data_reshape_[3] + j]); + } else if (type == ReduceType::MIN) { + for (index_t i = start0; i < end0; i += step0) { + for (index_t j = start1; j < end1; j += step1) { + T tmp = input[i * data_reshape_[2] * data_reshape_[3] + j]; + for (int k = 0; k < data_reshape_[2]; ++k) { + for (int t = 0; t < data_reshape_[0]; ++t) { + tmp = std::min(tmp, + input[((t * data_reshape_[1] + i) * + data_reshape_[2] + k) * data_reshape_[3] + + j]); + } } + output[i * data_reshape_[3] + j] = tmp; } - output[i * data_reshape_[3] + j] = tmp; } - } - } else if (type == ReduceType::MAX) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (int i = 0; i < data_reshape_[1]; ++i) { - for (int j = 0; j < data_reshape_[3]; ++j) { - T tmp = input[i * data_reshape_[2] * data_reshape_[3] + j]; - for (int k = 0; k < data_reshape_[2]; ++k) { - for (int t = 0; t < data_reshape_[0]; ++t) { - tmp = std::max(tmp, - input[((t * data_reshape_[1] + i) * - data_reshape_[2] + k)*data_reshape_[3] + j]); + } else if (type == ReduceType::MAX) { + for (index_t i = start0; i < end0; i += step0) { + for (index_t j = start1; j < end1; j += step1) { + T tmp = input[i * data_reshape_[2] * data_reshape_[3] + j]; + for (int k = 0; k < data_reshape_[2]; ++k) { + for (int t = 0; t < data_reshape_[0]; ++t) { + tmp = std::max(tmp, + input[((t * data_reshape_[1] + i) * + data_reshape_[2] + k) * data_reshape_[3] + + j]); + } } + output[i * data_reshape_[3] + j] = tmp; } - output[i * data_reshape_[3] + j] = tmp; } - } - } else if (type == ReduceType::PROD) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (int i = 0; i < data_reshape_[1]; ++i) { - for (int j = 0; j < data_reshape_[3]; ++j) { - T tmp = 1; - for (int k = 0; k < data_reshape_[2]; ++k) { - for (int t = 0; t < data_reshape_[0]; ++t) { - tmp = tmp * input[((t * data_reshape_[1] + i) * - data_reshape_[2] + k)*data_reshape_[3] + j]; + } else if (type == ReduceType::PROD) { + for (index_t i = start0; i < end0; i += step0) { + for (index_t j = start1; j < end1; j += step1) { + T tmp = 1; + for (int k = 0; k < data_reshape_[2]; ++k) { + for (int t = 0; t < data_reshape_[0]; ++t) { + tmp = tmp * input[((t * data_reshape_[1] + i) * + data_reshape_[2] + k) * data_reshape_[3] + j]; + } } + output[i * data_reshape_[3] + j] = tmp; } - output[i * data_reshape_[3] + j] = tmp; } + } else { + MACE_NOT_IMPLEMENTED; } - } else { - MACE_NOT_IMPLEMENTED; - } + }, 0, data_reshape_[1], 1, 0, data_reshape_[3], 1); } else { - if (type == ReduceType::MEAN) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (int i = 0; i < data_reshape_[0]; 
++i) { - for (int j = 0; j < data_reshape_[2]; ++j) { - for (int k = 0; k < data_reshape_[1]; ++k) { - for (int t = 0; t < data_reshape_[3]; ++t) { - output[i * data_reshape_[2] + j] += - input[((i * data_reshape_[1] + k) * - data_reshape_[2] + j)*data_reshape_[3] + t]; + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + if (type == ReduceType::MEAN) { + for (index_t i = start0; i < end0; i += step0) { + for (index_t j = start1; j < end1; j += step1) { + for (int k = 0; k < data_reshape_[1]; ++k) { + for (int t = 0; t < data_reshape_[3]; ++t) { + output[i * data_reshape_[2] + j] += + input[((i * data_reshape_[1] + k) * + data_reshape_[2] + j) * data_reshape_[3] + t]; + } } + output[i * data_reshape_[2] + j] /= + (data_reshape_[1] * data_reshape_[3]); } - output[i * data_reshape_[2] + j] /= - (data_reshape_[1] * data_reshape_[3]); } - } - } else if (type == ReduceType::MIN) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (int i = 0; i < data_reshape_[0]; ++i) { - for (int j = 0; j < data_reshape_[2]; ++j) { - T tmp = input[(i * data_reshape_[1] * - data_reshape_[2] + j)*data_reshape_[3]]; - for (int k = 0; k < data_reshape_[1]; ++k) { - for (int t = 0; t < data_reshape_[3]; ++t) { - tmp = - std::min(tmp, - input[((i * data_reshape_[1] + k) * - data_reshape_[2] + j)*data_reshape_[3] + t]); + } else if (type == ReduceType::MIN) { + for (index_t i = start0; i < end0; i += step0) { + for (index_t j = start1; j < end1; j += step1) { + T tmp = input[(i * data_reshape_[1] * + data_reshape_[2] + j) * data_reshape_[3]]; + for (int k = 0; k < data_reshape_[1]; ++k) { + for (int t = 0; t < data_reshape_[3]; ++t) { + tmp = + std::min(tmp, + input[((i * data_reshape_[1] + k) * + data_reshape_[2] + j) * data_reshape_[3] + + t]); + } } + output[i * data_reshape_[2] + j] = tmp; } - output[i * data_reshape_[2] + j] = tmp; } - } - } else if (type == ReduceType::MAX) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (int i = 0; i < data_reshape_[0]; ++i) { - for (int j = 0; j < data_reshape_[2]; ++j) { - T tmp = input[(i * data_reshape_[1] * - data_reshape_[2] + j)*data_reshape_[3]]; - for (int k = 0; k < data_reshape_[1]; ++k) { - for (int t = 0; t < data_reshape_[3]; ++t) { - tmp = - std::max(tmp, - input[((i * data_reshape_[1] + k) * - data_reshape_[2] + j)*data_reshape_[3] + t]); + } else if (type == ReduceType::MAX) { + for (index_t i = start0; i < end0; i += step0) { + for (index_t j = start1; j < end1; j += step1) { + T tmp = input[(i * data_reshape_[1] * + data_reshape_[2] + j) * data_reshape_[3]]; + for (int k = 0; k < data_reshape_[1]; ++k) { + for (int t = 0; t < data_reshape_[3]; ++t) { + tmp = + std::max(tmp, + input[((i * data_reshape_[1] + k) * + data_reshape_[2] + j) * data_reshape_[3] + + t]); + } } + output[i * data_reshape_[2] + j] = tmp; } - output[i * data_reshape_[2] + j] = tmp; } - } - } else if (type == ReduceType::PROD) { -#pragma omp parallel for schedule(runtime) - for (int i = 0; i < data_reshape_[0]; ++i) { - for (int j = 0; j < data_reshape_[2]; ++j) { - T tmp = 1; - for (int k = 0; k < data_reshape_[1]; ++k) { - for (int t = 0; t < data_reshape_[3]; ++t) { - tmp = tmp * input[((i * data_reshape_[1] + k) * - data_reshape_[2] + j)*data_reshape_[3] + t]; + } else if (type == ReduceType::PROD) { + for (index_t i = start0; i < end0; i += step0) { + for (index_t j = start1; j < end1; j += step1) { + T tmp = 1; + for (int k = 0; k < data_reshape_[1]; ++k) { + for (int t = 0; t < 
data_reshape_[3]; ++t) { + tmp = tmp * input[((i * data_reshape_[1] + k) * + data_reshape_[2] + j) * data_reshape_[3] + t]; + } } + output[i * data_reshape_[2] + j] = tmp; } - output[i * data_reshape_[2] + j] = tmp; } + } else { + MACE_NOT_IMPLEMENTED; } - } else { - MACE_NOT_IMPLEMENTED; - } + }, 0, data_reshape_[0], 1, 0, data_reshape_[2], 1); } } - void Compute(const Tensor *input, Tensor *output) { + void Compute(const OpContext *context, const Tensor *input, Tensor *output) { Tensor::MappingGuard input_mapper(input); const T *input_ptr = input->data(); Tensor::MappingGuard output_map(output); T *output_ptr = output->mutable_data(); memset(output_ptr, 0, output->size() * sizeof(T)); switch (data_reshape_.size()) { - case 1: - Reduce1Dims(input_ptr, reduce_type_, output_ptr); + case 1:Reduce1Dims(context, input_ptr, reduce_type_, output_ptr); break; - case 2: - Reduce2Dims(input_ptr, reduce_type_, output_ptr); + case 2:Reduce2Dims(context, input_ptr, reduce_type_, output_ptr); break; - case 3: - Reduce3Dims(input_ptr, reduce_type_, output_ptr); + case 3:Reduce3Dims(context, input_ptr, reduce_type_, output_ptr); break; - case 4: - Reduce4Dims(input_ptr, reduce_type_, output_ptr); + case 4:Reduce4Dims(context, input_ptr, reduce_type_, output_ptr); break; - default: - MACE_CHECK(false, "not implemented in mace") + default:MACE_CHECK(false, "not implemented in mace") << "data reshape size" << data_reshape_.size() << "reduce first axis:" << reduce_first_axis_; break; @@ -537,9 +549,11 @@ class ReduceOp : public ReduceOpBase { }; #ifdef MACE_ENABLE_QUANTIZE -template <> +template<> void ReduceOp::Reduce1Dims( + const OpContext *context, const uint8_t *input, ReduceType type, uint8_t *output) { + MACE_UNUSED(context); if (reduce_first_axis_) { if (type == ReduceType::MEAN) { uint32_t tmp = 0; @@ -568,275 +582,286 @@ void ReduceOp::Reduce1Dims( } } -template <> +template<> void ReduceOp::Reduce2Dims( + const OpContext *context, const uint8_t *input, ReduceType type, uint8_t *output) { + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + if (reduce_first_axis_) { - if (type == ReduceType::MEAN) { -#pragma omp parallel for schedule(runtime) - for (int i = 0; i < data_reshape_[1]; ++i) { - uint32_t tmp = 0; - for (int j = 0; j < data_reshape_[0]; ++j) { - tmp += input[j * data_reshape_[1] + i]; + thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { + if (type == ReduceType::MEAN) { + for (index_t i = start; i < end; i += step) { + uint32_t tmp = 0; + for (int j = 0; j < data_reshape_[0]; ++j) { + tmp += input[j * data_reshape_[1] + i]; + } + output[i] = static_cast( + (tmp + data_reshape_[0] / 2) / data_reshape_[0]); } - output[i] = static_cast( - (tmp + data_reshape_[0] / 2) / data_reshape_[0]); - } - } else if (type == ReduceType::MIN) { -#pragma omp parallel for schedule(runtime) - for (int i = 0; i < data_reshape_[1]; ++i) { - uint8_t tmp = input[i]; - for (int j = 1; j < data_reshape_[0]; ++j) { - tmp = std::min(tmp, input[j * data_reshape_[1] + i]); + } else if (type == ReduceType::MIN) { + for (index_t i = start; i < end; i += step) { + uint8_t tmp = input[i]; + for (int j = 1; j < data_reshape_[0]; ++j) { + tmp = std::min(tmp, input[j * data_reshape_[1] + i]); + } + output[i] = tmp; } - output[i] = tmp; - } - } else if (type == ReduceType::MAX) { -#pragma omp parallel for schedule(runtime) - for (int i = 0; i < data_reshape_[1]; ++i) { - uint8_t tmp = input[i]; - for (int j = 1; j < data_reshape_[0]; ++j) { - tmp = std::max(tmp, input[j * 
data_reshape_[1] + i]); + } else if (type == ReduceType::MAX) { + for (index_t i = start; i < end; i += step) { + uint8_t tmp = input[i]; + for (int j = 1; j < data_reshape_[0]; ++j) { + tmp = std::max(tmp, input[j * data_reshape_[1] + i]); + } + output[i] = tmp; } - output[i] = tmp; + } else { + MACE_NOT_IMPLEMENTED; } - } else { - MACE_NOT_IMPLEMENTED; - } + }, 0, data_reshape_[1], 1); } else { - if (type == ReduceType::MEAN) { -#pragma omp parallel for schedule(runtime) - for (int i = 0; i < data_reshape_[0]; ++i) { - uint32_t tmp = 0; - for (int j = 0; j < data_reshape_[1]; ++j) { - tmp += input[i * data_reshape_[1] + j]; + thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { + if (type == ReduceType::MEAN) { + for (index_t i = start; i < end; i += step) { + uint32_t tmp = 0; + for (int j = 0; j < data_reshape_[1]; ++j) { + tmp += input[i * data_reshape_[1] + j]; + } + output[i] = static_cast( + (tmp + data_reshape_[1] / 2) / data_reshape_[1]); } - output[i] = static_cast( - (tmp + data_reshape_[1] / 2) / data_reshape_[1]); - } - } else if (type == ReduceType::MIN) { -#pragma omp parallel for schedule(runtime) - for (int i = 0; i < data_reshape_[0]; ++i) { - uint8_t tmp = input[i * data_reshape_[1]]; - for (int j = 1; j < data_reshape_[1]; ++j) { - tmp = std::min(tmp, input[i * data_reshape_[1] + j]); + } else if (type == ReduceType::MIN) { + for (index_t i = start; i < end; i += step) { + uint8_t tmp = input[i * data_reshape_[1]]; + for (int j = 1; j < data_reshape_[1]; ++j) { + tmp = std::min(tmp, input[i * data_reshape_[1] + j]); + } + output[i] = tmp; } - output[i] = tmp; - } - } else if (type == ReduceType::MAX) { -#pragma omp parallel for schedule(runtime) - for (int i = 0; i < data_reshape_[0]; ++i) { - uint8_t tmp = input[i * data_reshape_[1]]; - for (int j = 1; j < data_reshape_[1]; ++j) { - tmp = std::max(tmp, input[i * data_reshape_[1] + j]); + } else if (type == ReduceType::MAX) { + for (index_t i = start; i < end; i += step) { + uint8_t tmp = input[i * data_reshape_[1]]; + for (int j = 1; j < data_reshape_[1]; ++j) { + tmp = std::max(tmp, input[i * data_reshape_[1] + j]); + } + output[i] = tmp; } - output[i] = tmp; + } else { + MACE_NOT_IMPLEMENTED; } - } else { - MACE_NOT_IMPLEMENTED; - } + }, 0, data_reshape_[0], 1); } } -template <> +template<> void ReduceOp::Reduce3Dims( + const OpContext *context, const uint8_t *input, ReduceType type, uint8_t *output) { + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + if (reduce_first_axis_) { - if (type == ReduceType::MEAN) { -#pragma omp parallel for collapse(1) schedule(runtime) - for (int i = 0; i < data_reshape_[1]; ++i) { - uint32_t tmp = 0; - for (int j = 0; j < data_reshape_[2]; ++j) { - for (int k = 0; k < data_reshape_[0]; ++k) { - tmp += input[(k * data_reshape_[1] + i) * data_reshape_[2] + j]; + thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { + if (type == ReduceType::MEAN) { + for (index_t i = start; i < end; i += step) { + uint32_t tmp = 0; + for (int j = 0; j < data_reshape_[2]; ++j) { + for (int k = 0; k < data_reshape_[0]; ++k) { + tmp += input[(k * data_reshape_[1] + i) * data_reshape_[2] + j]; + } } + index_t dim = data_reshape_[0] * data_reshape_[2]; + output[i] = static_cast((tmp + dim / 2) / dim); } - index_t dim = data_reshape_[0] * data_reshape_[2]; - output[i] = static_cast((tmp + dim / 2) / dim); - } - } else if (type == ReduceType::MIN) { -#pragma omp parallel for collapse(1) schedule(runtime) - for (int i = 0; i < 
data_reshape_[1]; ++i) { - uint8_t tmp = input[i * data_reshape_[2]]; - for (int j = 0; j < data_reshape_[2]; ++j) { - for (int k = 0; k < data_reshape_[0]; ++k) { - tmp = std::min(tmp, - input[(k * data_reshape_[1] + i) * data_reshape_[2] - + j]); + } else if (type == ReduceType::MIN) { + for (index_t i = start; i < end; i += step) { + uint8_t tmp = input[i * data_reshape_[2]]; + for (int j = 0; j < data_reshape_[2]; ++j) { + for (int k = 0; k < data_reshape_[0]; ++k) { + tmp = std::min(tmp, + input[(k * data_reshape_[1] + i) * data_reshape_[2] + + j]); + } } + output[i] = tmp; } - output[i] = tmp; - } - } else if (type == ReduceType::MAX) { -#pragma omp parallel for collapse(1) schedule(runtime) - for (int i = 0; i < data_reshape_[1]; ++i) { - uint8_t tmp = input[i * data_reshape_[2]]; - for (int j = 0; j < data_reshape_[2]; ++j) { - for (int k = 0; k < data_reshape_[0]; ++k) { - tmp = - std::max(tmp, - input[(k * data_reshape_[1] + i) - * data_reshape_[2] + j]); + } else if (type == ReduceType::MAX) { + for (index_t i = start; i < end; i += step) { + uint8_t tmp = input[i * data_reshape_[2]]; + for (int j = 0; j < data_reshape_[2]; ++j) { + for (int k = 0; k < data_reshape_[0]; ++k) { + tmp = + std::max(tmp, + input[(k * data_reshape_[1] + i) + * data_reshape_[2] + j]); + } } + output[i] = tmp; } - output[i] = tmp; + } else { + MACE_NOT_IMPLEMENTED; } - } else { - MACE_NOT_IMPLEMENTED; - } + }, 0, data_reshape_[1], 1); } else { - if (type == ReduceType::MEAN) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (int i = 0; i < data_reshape_[0]; ++i) { - for (int j = 0; j < data_reshape_[2]; ++j) { - uint32_t tmp = 0; - for (int k = 0; k < data_reshape_[1]; ++k) { - tmp += input[(i * data_reshape_[1] + k) * data_reshape_[2] + j]; + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + if (type == ReduceType::MEAN) { + for (index_t i = start0; i < end0; i += step0) { + for (index_t j = start1; j < end1; j += step1) { + uint32_t tmp = 0; + for (int k = 0; k < data_reshape_[1]; ++k) { + tmp += input[(i * data_reshape_[1] + k) * data_reshape_[2] + j]; + } + output[i * data_reshape_[2] + j] = + static_cast((tmp + data_reshape_[1] / 2) / + data_reshape_[1]); } - output[i * data_reshape_[2] + j] = - static_cast((tmp + data_reshape_[1] / 2) / - data_reshape_[1]); } - } - } else if (type == ReduceType::MIN) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (int i = 0; i < data_reshape_[0]; ++i) { - for (int j = 0; j < data_reshape_[2]; ++j) { - uint8_t tmp = input[i * data_reshape_[1] * data_reshape_[2] + j]; - for (int k = 1; k < data_reshape_[1]; ++k) { - tmp = std::min(tmp, - input[(i * data_reshape_[1] + k) * - data_reshape_[2] + j]); - } - output[i * data_reshape_[2] + j] = tmp; + } else if (type == ReduceType::MIN) { + for (index_t i = start0; i < end0; i += step0) { + for (index_t j = start1; j < end1; j += step1) { + uint8_t tmp = input[i * data_reshape_[1] * data_reshape_[2] + j]; + for (int k = 1; k < data_reshape_[1]; ++k) { + tmp = std::min(tmp, + input[(i * data_reshape_[1] + k) * + data_reshape_[2] + j]); + } + output[i * data_reshape_[2] + j] = tmp; + } } - } - } else if (type == ReduceType::MAX) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (int i = 0; i < data_reshape_[0]; ++i) { - for (int j = 0; j < data_reshape_[2]; ++j) { - uint8_t tmp = input[i * data_reshape_[1] * data_reshape_[2] + j]; - for (int k = 1; k < data_reshape_[1]; ++k) { - tmp = std::max(tmp, - input[(i 
* data_reshape_[1] + k) * - data_reshape_[2] + j]); - } - output[i * data_reshape_[2] + j] = tmp; + } else if (type == ReduceType::MAX) { + for (index_t i = start0; i < end0; i += step0) { + for (index_t j = start1; j < end1; j += step1) { + uint8_t tmp = input[i * data_reshape_[1] * data_reshape_[2] + j]; + for (int k = 1; k < data_reshape_[1]; ++k) { + tmp = std::max(tmp, + input[(i * data_reshape_[1] + k) * + data_reshape_[2] + j]); + } + output[i * data_reshape_[2] + j] = tmp; + } } + } else { + MACE_NOT_IMPLEMENTED; } - } else { - MACE_NOT_IMPLEMENTED; - } + }, 0, data_reshape_[0], 1, 0, data_reshape_[2], 1); } } -template <> +template<> void ReduceOp::Reduce4Dims( + const OpContext *context, const uint8_t *input, ReduceType type, uint8_t *output) { + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + if (reduce_first_axis_) { - if (type == ReduceType::MEAN) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (int i = 0; i < data_reshape_[1]; ++i) { - for (int j = 0; j < data_reshape_[3]; ++j) { - uint32_t tmp = 0; - for (int k = 0; k < data_reshape_[2]; ++k) { - for (int t = 0; t < data_reshape_[0]; ++t) { - tmp += input[((t * data_reshape_[1] + i) * - data_reshape_[2] + k)*data_reshape_[3] + j]; + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + if (type == ReduceType::MEAN) { + for (index_t i = start0; i < end0; i += step0) { + for (index_t j = start1; j < end1; j += step1) { + uint32_t tmp = 0; + for (int k = 0; k < data_reshape_[2]; ++k) { + for (int t = 0; t < data_reshape_[0]; ++t) { + tmp += input[((t * data_reshape_[1] + i) * + data_reshape_[2] + k) * data_reshape_[3] + j]; + } } + index_t dim = data_reshape_[0] * data_reshape_[2]; + output[i * data_reshape_[3] + j] = + static_cast((tmp + dim / 2) / dim); } - index_t dim = data_reshape_[0] * data_reshape_[2]; - output[i * data_reshape_[3] + j] = - static_cast((tmp + dim / 2) / dim); } - } - } else if (type == ReduceType::MIN) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (int i = 0; i < data_reshape_[1]; ++i) { - for (int j = 0; j < data_reshape_[3]; ++j) { - uint8_t tmp = input[i * data_reshape_[2] * data_reshape_[3] + j]; - for (int k = 0; k < data_reshape_[2]; ++k) { - for (int t = 0; t < data_reshape_[0]; ++t) { - tmp = std::min(tmp, - input[((t * data_reshape_[1] + i) * - data_reshape_[2] + k)*data_reshape_[3] + j]); + } else if (type == ReduceType::MIN) { + for (index_t i = start0; i < end0; i += step0) { + for (index_t j = start1; j < end1; j += step1) { + uint8_t tmp = input[i * data_reshape_[2] * data_reshape_[3] + j]; + for (int k = 0; k < data_reshape_[2]; ++k) { + for (int t = 0; t < data_reshape_[0]; ++t) { + tmp = std::min(tmp, + input[((t * data_reshape_[1] + i) * + data_reshape_[2] + k) * data_reshape_[3] + + j]); + } } + output[i * data_reshape_[3] + j] = tmp; } - output[i * data_reshape_[3] + j] = tmp; } - } - } else if (type == ReduceType::MAX) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (int i = 0; i < data_reshape_[1]; ++i) { - for (int j = 0; j < data_reshape_[3]; ++j) { - uint8_t tmp = input[i * data_reshape_[2] * data_reshape_[3] + j]; - for (int k = 0; k < data_reshape_[2]; ++k) { - for (int t = 0; t < data_reshape_[0]; ++t) { - tmp = std::max(tmp, - input[((t * data_reshape_[1] + i) * - data_reshape_[2] + k)*data_reshape_[3] + j]); + } else if (type == ReduceType::MAX) { + for (index_t i = start0; i < end0; i += step0) { + for (index_t j = 
start1; j < end1; j += step1) { + uint8_t tmp = input[i * data_reshape_[2] * data_reshape_[3] + j]; + for (int k = 0; k < data_reshape_[2]; ++k) { + for (int t = 0; t < data_reshape_[0]; ++t) { + tmp = std::max(tmp, + input[((t * data_reshape_[1] + i) * + data_reshape_[2] + k) * data_reshape_[3] + + j]); + } } + output[i * data_reshape_[3] + j] = tmp; } - output[i * data_reshape_[3] + j] = tmp; } + } else { + MACE_NOT_IMPLEMENTED; } - } else { - MACE_NOT_IMPLEMENTED; - } + }, 0, data_reshape_[1], 1, 0, data_reshape_[3], 1); } else { - if (type == ReduceType::MEAN) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (int i = 0; i < data_reshape_[0]; ++i) { - for (int j = 0; j < data_reshape_[2]; ++j) { - uint32_t tmp = 0; - for (int k = 0; k < data_reshape_[1]; ++k) { - for (int t = 0; t < data_reshape_[3]; ++t) { - tmp += input[((i * data_reshape_[1] + k) * - data_reshape_[2] + j)*data_reshape_[3] + t]; + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + if (type == ReduceType::MEAN) { + for (index_t i = start0; i < end0; i += step0) { + for (index_t j = start1; j < end1; j += step1) { + uint32_t tmp = 0; + for (int k = 0; k < data_reshape_[1]; ++k) { + for (int t = 0; t < data_reshape_[3]; ++t) { + tmp += input[((i * data_reshape_[1] + k) * + data_reshape_[2] + j) * data_reshape_[3] + t]; + } } + index_t dim = data_reshape_[1] * data_reshape_[3]; + output[i * data_reshape_[2] + j] = + static_cast((tmp + dim / 2) / dim); } - index_t dim = data_reshape_[1] * data_reshape_[3]; - output[i * data_reshape_[2] + j] = - static_cast((tmp + dim / 2) / dim); } - } - } else if (type == ReduceType::MIN) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (int i = 0; i < data_reshape_[0]; ++i) { - for (int j = 0; j < data_reshape_[2]; ++j) { - uint8_t tmp = input[(i * data_reshape_[1] * - data_reshape_[2] + j)*data_reshape_[3]]; - for (int k = 0; k < data_reshape_[1]; ++k) { - for (int t = 0; t < data_reshape_[3]; ++t) { - tmp = - std::min(tmp, - input[((i * data_reshape_[1] + k) * - data_reshape_[2] + j)*data_reshape_[3] + t]); + } else if (type == ReduceType::MIN) { + for (index_t i = start0; i < end0; i += step0) { + for (index_t j = start1; j < end1; j += step1) { + uint8_t tmp = input[(i * data_reshape_[1] * + data_reshape_[2] + j) * data_reshape_[3]]; + for (int k = 0; k < data_reshape_[1]; ++k) { + for (int t = 0; t < data_reshape_[3]; ++t) { + tmp = + std::min(tmp, + input[((i * data_reshape_[1] + k) * + data_reshape_[2] + j) * data_reshape_[3] + t]); + } } + output[i * data_reshape_[2] + j] = tmp; } - output[i * data_reshape_[2] + j] = tmp; } - } - } else if (type == ReduceType::MAX) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (int i = 0; i < data_reshape_[0]; ++i) { - for (int j = 0; j < data_reshape_[2]; ++j) { - uint8_t tmp = input[(i * data_reshape_[1] * - data_reshape_[2] + j)*data_reshape_[3]]; - for (int k = 0; k < data_reshape_[1]; ++k) { - for (int t = 0; t < data_reshape_[3]; ++t) { - tmp = - std::max(tmp, - input[((i * data_reshape_[1] + k) * - data_reshape_[2] + j)*data_reshape_[3] + t]); + } else if (type == ReduceType::MAX) { + for (index_t i = start0; i < end0; i += step0) { + for (index_t j = start1; j < end1; j += step1) { + uint8_t tmp = input[(i * data_reshape_[1] * + data_reshape_[2] + j) * data_reshape_[3]]; + for (int k = 0; k < data_reshape_[1]; ++k) { + for (int t = 0; t < data_reshape_[3]; ++t) { + tmp = + std::max(tmp, + input[((i * data_reshape_[1] + k) 
* + data_reshape_[2] + j) * data_reshape_[3] + t]); + } } + output[i * data_reshape_[2] + j] = tmp; } - output[i * data_reshape_[2] + j] = tmp; } + } else { + MACE_NOT_IMPLEMENTED; } - } else { - MACE_NOT_IMPLEMENTED; - } + }, 0, data_reshape_[0], 1, 0, data_reshape_[2], 1); } } #endif // MACE_ENABLE_QUANTIZE diff --git a/mace/ops/ref/activation.cc b/mace/ops/ref/activation.cc new file mode 100644 index 0000000000000000000000000000000000000000..4e2e65dbe71ef5b0e243a2be7d7803028de1f8d8 --- /dev/null +++ b/mace/ops/ref/activation.cc @@ -0,0 +1,104 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "mace/ops/ref/activation.h" + +namespace mace { +namespace ops { +namespace ref { + +Activation::Activation(ActivationType type, + const float limit, + const float leakyrelu_coefficient) + : type_(type), + limit_(limit), + leakyrelu_coefficient_(leakyrelu_coefficient) {} + +MaceStatus Activation::Compute(const OpContext *context, + const Tensor *input, + Tensor *output) { + Tensor::MappingGuard input_guard(input); + if (input != output) { + MACE_RETURN_IF_ERROR(output->ResizeLike(input)); + Tensor::MappingGuard output_guard(output); + DoActivation(context, input, output); + } else { + DoActivation(context, input, output); + } + + return MaceStatus::MACE_SUCCESS; +} + +void Activation::DoActivation(const OpContext *context, + const Tensor *input, + Tensor *output) { + MACE_UNUSED(context); + auto input_ptr = input->data(); + auto output_ptr = output->mutable_data(); + const index_t size = input->size(); + + switch (type_) { + case RELU: { + for (index_t i = 0; i < size; ++i) { + *output_ptr++ = std::max(0.f, *input_ptr++); + } + + break; + } + + case RELUX: { + for (index_t i = 0; i < size; ++i) { + *output_ptr++ = std::max(0.f, std::min(limit_, *input_ptr++)); + } + + break; + } + + case LEAKYRELU: { + for (index_t i = 0; i < size; ++i) { + *output_ptr = + std::max(*input_ptr, 0.f) + + std::min(*input_ptr, 0.f) * leakyrelu_coefficient_; + ++input_ptr; + ++output_ptr; + } + + break; + } + + case TANH: { + for (index_t i = 0; i < size; ++i) { + *output_ptr++ = std::tanh(*input_ptr++); + } + + break; + } + + case SIGMOID: { + for (index_t i = 0; i < size; ++i) { + *output_ptr++ = 1 / (1 + std::exp(-(*input_ptr++))); + } + break; + } + + case NOOP:break; + + default:MACE_NOT_IMPLEMENTED; + } +} + +} // namespace ref +} // namespace ops +} // namespace mace diff --git a/mace/ops/ref/activation.h b/mace/ops/ref/activation.h new file mode 100644 index 0000000000000000000000000000000000000000..7ad986a50ceed14b021abf2a4d81f2bb7b336e19 --- /dev/null +++ b/mace/ops/ref/activation.h @@ -0,0 +1,51 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
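For readers following the reduce.cc hunks above: every removed `#pragma omp parallel for` loop becomes a `utils::ThreadPool::Compute1D` (or `Compute2D`) call whose callback receives a per-worker (start, end, step) range and iterates `for (i = start; i < end; i += step)`. The standalone sketch below illustrates that shape with a column-wise mean; `Compute1D` here is a simplified stand-in written with std::thread, not MACE's thread pool, and its chunking policy is an assumption made only for illustration.

// Minimal sketch, not MACE's implementation: a stand-in for
// utils::ThreadPool::Compute1D that splits [start, end) into contiguous
// chunks, one per worker thread, and invokes the same (start, end, step)
// callback shape used in the reduce.cc hunks above.
#include <algorithm>
#include <cstdint>
#include <functional>
#include <thread>
#include <vector>

using index_t = int64_t;  // MACE uses a signed 64-bit index type.

void Compute1D(const std::function<void(index_t, index_t, index_t)> &fn,
               index_t start, index_t end, index_t step,
               int num_threads = 4) {
  std::vector<std::thread> workers;
  const index_t total = end - start;
  const index_t chunk = (total + num_threads - 1) / num_threads;
  for (int t = 0; t < num_threads; ++t) {
    const index_t s = start + t * chunk;
    const index_t e = std::min(end, s + chunk);
    if (s >= e) break;
    workers.emplace_back(fn, s, e, step);
  }
  for (auto &w : workers) w.join();
}

int main() {
  // Column-wise mean of a rows x cols matrix, mirroring the MEAN branch of
  // Reduce2Dims when the first (row) axis is reduced.
  const index_t rows = 64, cols = 256;
  std::vector<float> input(rows * cols, 1.0f), output(cols, 0.0f);

  Compute1D([&](index_t start, index_t end, index_t step) {
    for (index_t i = start; i < end; i += step) {
      float tmp = 0;
      for (index_t j = 0; j < rows; ++j) {
        tmp += input[j * cols + i];
      }
      output[i] = tmp / rows;
    }
  }, 0, cols, 1);
  return 0;
}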
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_OPS_REF_ACTIVATION_H_ +#define MACE_OPS_REF_ACTIVATION_H_ + +#include "mace/core/op_context.h" +#include "mace/ops/common/activation_type.h" + +namespace mace { +namespace ops { +namespace ref { + +class Activation { + public: + explicit Activation(ActivationType type, + const float limit, + const float leakyrelu_coefficient); + ~Activation() = default; + + MaceStatus Compute( + const OpContext *context, + const Tensor *input, + Tensor *output); + + private: + void DoActivation(const OpContext *context, + const Tensor *input, + Tensor *output); + + ActivationType type_; + const float limit_; + const float leakyrelu_coefficient_; +}; + +} // namespace ref +} // namespace ops +} // namespace mace + +#endif // MACE_OPS_REF_ACTIVATION_H_ diff --git a/mace/ops/ref/bias_add.cc b/mace/ops/ref/bias_add.cc new file mode 100644 index 0000000000000000000000000000000000000000..c3be25b08d070ac2791fe971325cce57d96de831 --- /dev/null +++ b/mace/ops/ref/bias_add.cc @@ -0,0 +1,76 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
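The DoActivation switch in activation.cc above applies simple element-wise formulas. The sketch below restates them over a raw float buffer; the `ApplyActivation` helper and the `Act` enum are illustrative names, not part of the MACE API.

// Illustrative only: the element-wise formulas used in DoActivation above,
// written as a free function over a raw buffer.
#include <algorithm>
#include <cmath>
#include <cstdint>

enum class Act { kRelu, kRelux, kLeakyRelu, kTanh, kSigmoid };

void ApplyActivation(const float *in, float *out, int64_t size,
                     Act type, float limit, float leaky_coeff) {
  for (int64_t i = 0; i < size; ++i) {
    const float x = in[i];
    switch (type) {
      case Act::kRelu:      out[i] = std::max(0.f, x); break;
      case Act::kRelux:     out[i] = std::max(0.f, std::min(limit, x)); break;
      // Branch-free leaky ReLU, as in the LEAKYRELU case above:
      // max(x, 0) keeps the positive part, min(x, 0) * coeff scales the rest.
      case Act::kLeakyRelu: out[i] = std::max(x, 0.f) +
                                     std::min(x, 0.f) * leaky_coeff; break;
      case Act::kTanh:      out[i] = std::tanh(x); break;
      case Act::kSigmoid:   out[i] = 1.f / (1.f + std::exp(-x)); break;
    }
  }
}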
+ +#include "mace/ops/ref/bias_add.h" + +namespace mace { +namespace ops { +namespace ref { + +MaceStatus BiasAdd::Compute(const OpContext *context, + const Tensor *input, + const Tensor *bias, + Tensor *output) { + Tensor::MappingGuard input_guard(input); + Tensor::MappingGuard bias_guard(bias); + if (input != output) { + MACE_RETURN_IF_ERROR(output->ResizeLike(input)); + if (bias == nullptr) { + output->Copy(*input); + } else { + Tensor::MappingGuard output_guard(output); + AddBias(context, input, bias, output); + } + } else { + if (bias != nullptr) { + AddBias(context, input, bias, output); + } + } + + return MaceStatus::MACE_SUCCESS; +} + +void BiasAdd::AddBias(const OpContext *context, + const Tensor *input, + const Tensor *bias, + mace::Tensor *output) { + MACE_UNUSED(context); + auto input_data = input->data(); + auto bias_data = bias->data(); + auto output_data = output->mutable_data(); + + const index_t batch = input->dim(0); + const index_t channels = input->dim(1); + const index_t height = output->dim(2); + const index_t width = output->dim(3); + const index_t image_size = height * width; + + for (index_t b = 0; b < batch; ++b) { + for (index_t c = 0; c < channels; ++c) { + const index_t offset = (b * channels + c) * image_size; + auto input_ptr = input_data + offset; + auto output_ptr = output_data + offset; + const float bias = bias_data[c]; + + for (index_t i = 0; i < image_size; ++i) { + (*output_ptr++) = (*input_ptr++) + bias; + } + } + } +} + +} // namespace ref +} // namespace ops +} // namespace mace + diff --git a/mace/ops/arm/activation_neon.h b/mace/ops/ref/bias_add.h similarity index 56% rename from mace/ops/arm/activation_neon.h rename to mace/ops/ref/bias_add.h index d640e689a2c1e91cb614826b9af1b53d7c90ef94..f3dc6096e0ae409d0a4b226ebd21b04d6e0228b5 100644 --- a/mace/ops/arm/activation_neon.h +++ b/mace/ops/ref/bias_add.h @@ -12,23 +12,35 @@ // See the License for the specific language governing permissions and // limitations under the License. 
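BiasAdd::AddBias above walks NCHW data plane by plane, adding one scalar bias per channel. A minimal sketch of that indexing follows, with plain buffers standing in for mace::Tensor; the function name is chosen here for illustration.

// Sketch of the NCHW indexing used by BiasAdd::AddBias above: each
// (batch, channel) plane of height*width elements gets a single scalar
// bias added.
#include <cstdint>

void AddBiasNCHW(const float *input, const float *bias, float *output,
                 int64_t batch, int64_t channels,
                 int64_t height, int64_t width) {
  const int64_t image_size = height * width;
  for (int64_t b = 0; b < batch; ++b) {
    for (int64_t c = 0; c < channels; ++c) {
      const int64_t offset = (b * channels + c) * image_size;
      for (int64_t i = 0; i < image_size; ++i) {
        output[offset + i] = input[offset + i] + bias[c];
      }
    }
  }
}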
-#ifndef MACE_OPS_ARM_ACTIVATION_NEON_H_ -#define MACE_OPS_ARM_ACTIVATION_NEON_H_ +#ifndef MACE_OPS_REF_BIAS_ADD_H_ +#define MACE_OPS_REF_BIAS_ADD_H_ -#include "mace/core/types.h" +#include "mace/core/op_context.h" namespace mace { namespace ops { - -void ReluNeon(const float *input, const index_t size, float *output); - -void ReluxNeon(const float *input, const float limit, - const index_t size, float *output); - -void LeakyReluNeon(const float *input, const float alpha, - const index_t size, float *output); - +namespace ref { + +class BiasAdd { + public: + BiasAdd() = default; + ~BiasAdd() = default; + + MaceStatus Compute( + const OpContext *context, + const Tensor *input, + const Tensor *bias, + Tensor *output); + + private: + void AddBias(const OpContext *context, + const Tensor *input, + const Tensor *bias, + Tensor *output); +}; + +} // namespace ref } // namespace ops } // namespace mace -#endif // MACE_OPS_ARM_ACTIVATION_NEON_H_ +#endif // MACE_OPS_REF_BIAS_ADD_H_ diff --git a/mace/ops/ref/conv_2d.cc b/mace/ops/ref/conv_2d.cc index e5b7952a334b8fb5bcc4d13d8264fc6f76d8c41d..1c69ee9d72e98dbb357347ed2d4e10d971e1cb07 100644 --- a/mace/ops/ref/conv_2d.cc +++ b/mace/ops/ref/conv_2d.cc @@ -66,7 +66,6 @@ MaceStatus Conv2d::Compute(const OpContext *context, auto filter_data = filter->data(); auto output_data = output->mutable_data(); -#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < in_shape[0]; b++) { for (index_t m = 0; m < filter_shape[0]; ++m) { const index_t in_height = in_shape[2]; diff --git a/mace/ops/ref/conv_2d.h b/mace/ops/ref/conv_2d.h index c04eff0fdecef6579f8065f1eb91a0dfef60a8b2..9a9fbb8f92363fed058d9a96929714c8870ab028 100644 --- a/mace/ops/ref/conv_2d.h +++ b/mace/ops/ref/conv_2d.h @@ -30,9 +30,9 @@ namespace ref { template class Conv2d { public: - Conv2d(const std::vector strides, - const std::vector dilations, - const std::vector paddings, + Conv2d(const std::vector &strides, + const std::vector &dilations, + const std::vector &paddings, const Padding padding_type) : strides_(strides), dilations_(dilations), @@ -55,9 +55,9 @@ class Conv2d { template<> class Conv2d { public: - Conv2d(const std::vector strides, - const std::vector dilations, - const std::vector paddings, + Conv2d(const std::vector &strides, + const std::vector &dilations, + const std::vector &paddings, const Padding padding_type) : strides_(strides), dilations_(dilations), diff --git a/mace/ops/ref/deconv_2d.cc b/mace/ops/ref/deconv_2d.cc new file mode 100644 index 0000000000000000000000000000000000000000..6044af3b7fefa5e698bb6db02220832a8802af79 --- /dev/null +++ b/mace/ops/ref/deconv_2d.cc @@ -0,0 +1,167 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
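The quantized (uint8) ReduceOp specializations earlier in this diff keep the input's scale and zero point and compute MEAN in integer arithmetic, adding half the divisor before dividing so the integer division rounds to nearest instead of truncating. A minimal sketch of that rounding; the helper name is illustrative only.

// Sketch of the rounding used by the quantized MEAN branches above:
// accumulate into a wider integer, then add half the divisor before the
// division. Assumes count > 0 and that the sum fits in 32 bits.
#include <cstdint>

uint8_t RoundedMean(const uint8_t *data, int64_t count) {
  uint32_t sum = 0;
  for (int64_t i = 0; i < count; ++i) {
    sum += data[i];
  }
  // (sum + count / 2) / count == round(sum / count) for non-negative values.
  return static_cast<uint8_t>((sum + count / 2) / count);
}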
+ +#include +#include +#include +#include +#include "mace/ops/ref/deconv_2d.h" +#include "mace/utils/memory.h" + +namespace mace { +namespace ops { +namespace ref { + +MaceStatus Deconv2d::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) { + MACE_UNUSED(context); + + std::vector out_shape; + if (output_shape) { + Tensor::MappingGuard out_shape_guard(output_shape); + MACE_CHECK(output_shape->size() == 4, "output shape should be 4-dims"); + out_shape = + std::vector(output_shape->data(), + output_shape->data() + 4); + } + std::vector padded_out_shape; + std::vector out_pad_size; + CalDeconvOutputShapeAndPadSize(input->shape(), + filter->shape(), + strides_, + padding_type_, + paddings_, + 1, + &out_shape, + nullptr, + &out_pad_size, + &padded_out_shape, + framework_type_, + NCHW); + + MACE_RETURN_IF_ERROR(output->Resize(out_shape)); + + const bool is_out_padded = + padded_out_shape[2] != out_shape[2] + || padded_out_shape[3] != out_shape[3]; + + std::unique_ptr padded_output(nullptr); + if (is_out_padded) { + index_t padded_out_size = + std::accumulate(padded_out_shape.begin(), + padded_out_shape.end(), + 1, + std::multiplies()) * sizeof(float); + ScratchBuffer *scratch = context->device()->scratch_buffer(); + scratch->Rewind(); + index_t scratch_size = PadAlignSize(padded_out_size); + scratch->GrowSize(scratch_size); + + std::unique_ptr + padded_out + (make_unique(scratch->Scratch(scratch_size), DT_FLOAT)); + padded_out->Reshape(padded_out_shape); + padded_output = std::move(padded_out); + } + Tensor *out_tensor = output; + if (padded_output != nullptr) { + out_tensor = padded_output.get(); + } + + out_tensor->Clear(); + + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard filter_mapper(filter); + Tensor::MappingGuard output_mapper(output); + + auto input_data = input->data(); + auto filter_data = filter->data(); + auto pad_out_data = out_tensor->mutable_data(); + auto out_data = output->mutable_data(); + + auto &in_shape = input->shape(); + + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t pad_out_height = padded_out_shape[2]; + const index_t pad_out_width = padded_out_shape[3]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + const index_t out_img_size = pad_out_height * pad_out_width; + const index_t in_img_size = in_height * in_width; + const index_t kernel_h = filter->dim(2); + const index_t kernel_w = filter->dim(3); + const int kernel_size = static_cast(kernel_h * kernel_w); + const index_t pad_top = out_pad_size[0] / 2; + const index_t pad_left = out_pad_size[1] / 2; + + std::vector index_map(kernel_size, 0); + for (index_t i = 0; i < kernel_h; ++i) { + for (index_t j = 0; j < kernel_w; ++j) { + index_map[i * kernel_w + j] = i * pad_out_width + j; + } + } + + const index_t batch = in_shape[0]; + const index_t out_channels = out_shape[1]; + const index_t in_channels = in_shape[1]; + + for (index_t b = 0; b < batch; ++b) { + for (index_t oc = 0; oc < out_channels; ++oc) { + float *out_base = + pad_out_data + (b * out_channels + oc) * out_img_size; + for (index_t i = 0; i < in_height; ++i) { + for (index_t j = 0; j < in_width; ++j) { + const index_t out_offset = + i * strides_[0] * pad_out_width + j * strides_[1]; + for (index_t ic = 0; ic < in_channels; ++ic) { + const index_t input_idx = + (b * in_channels + ic) * in_img_size + i * in_width + j; + const float val = input_data[input_idx]; + const index_t 
kernel_offset = + (oc * in_channels + ic) * kernel_size; + for (int k = 0; k < kernel_size; ++k) { + const index_t out_idx = out_offset + index_map[k]; + const index_t kernel_idx = kernel_offset + k; + out_base[out_idx] += val * filter_data[kernel_idx]; + } + } + } + } + } + } + if (out_tensor != output) { + for (index_t i = 0; i < batch; ++i) { + for (index_t j = 0; j < out_channels; ++j) { + for (index_t k = 0; k < out_height; ++k) { + const float *input_base = + pad_out_data + + ((i * out_channels + j) * pad_out_height + (k + pad_top)) + * pad_out_width; + float *output_base = + out_data + ((i * out_channels + j) * out_height + k) * out_width; + memcpy(output_base, input_base + pad_left, out_width * sizeof(float)); + } + } + } + } + return MaceStatus::MACE_SUCCESS; +} + +} // namespace ref +} // namespace ops +} // namespace mace diff --git a/mace/ops/ref/deconv_2d.h b/mace/ops/ref/deconv_2d.h new file mode 100644 index 0000000000000000000000000000000000000000..a8ab6722b47037f2552faaea8d8cca5151f463ae --- /dev/null +++ b/mace/ops/ref/deconv_2d.h @@ -0,0 +1,97 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +#ifndef MACE_OPS_REF_DECONV_2D_H_ +#define MACE_OPS_REF_DECONV_2D_H_ + +#include + +#include "mace/public/mace.h" +#include "mace/core/tensor.h" +#include "mace/core/op_context.h" +#include "mace/ops/common/conv_pool_2d_util.h" + +namespace mace { +namespace ops { +namespace ref { + +template +class Deconv2d { + public: + Deconv2d(const std::vector &strides, + const std::vector &dilations, + const std::vector &paddings, + const Padding padding_type, + const FrameworkType framework_type) + : strides_(strides), + dilations_(dilations), + paddings_(paddings), + padding_type_(padding_type), + framework_type_(framework_type) {} + + ~Deconv2d() = default; + + MaceStatus Compute( + const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output); + + private: + const std::vector strides_; + const std::vector dilations_; + const std::vector paddings_; + const Padding padding_type_; + const FrameworkType framework_type_; +}; + +template<> +class Deconv2d { + public: + Deconv2d(const std::vector &strides, + const std::vector &dilations, + const std::vector &paddings, + const Padding padding_type, + const FrameworkType framework_type) + : strides_(strides), + dilations_(dilations), + paddings_(paddings), + padding_type_(padding_type), + framework_type_(framework_type) {} + + ~Deconv2d() = default; + + MaceStatus Compute( + const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output); + + private: + const std::vector strides_; + const std::vector dilations_; + const std::vector paddings_; + const Padding padding_type_; + const FrameworkType framework_type_; +}; + +} // namespace ref +} // namespace ops +} // namespace mace + +#endif // MACE_OPS_REF_DECONV_2D_H_ + diff --git a/mace/ops/ref/depthwise_conv_2d.cc 
b/mace/ops/ref/depthwise_conv_2d.cc index b9f8b31f6ad517ae07ae15295dcc1f7688584861..bff950690d719103c31f4ddeb36a7cd934e256c3 100644 --- a/mace/ops/ref/depthwise_conv_2d.cc +++ b/mace/ops/ref/depthwise_conv_2d.cc @@ -69,7 +69,6 @@ MaceStatus DepthwiseConv2d::Compute(const OpContext *context, auto filter_data = filter->data(); auto output_data = output->mutable_data(); -#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < in_shape[0]; b++) { for (index_t m = 0; m < out_shape[1]; ++m) { const index_t c = m / multiplier; @@ -119,5 +118,3 @@ MaceStatus DepthwiseConv2d::Compute(const OpContext *context, } // namespace ref } // namespace ops } // namespace mace - - diff --git a/mace/ops/ref/depthwise_conv_2d.h b/mace/ops/ref/depthwise_conv_2d.h index ad493eb207ac8a8edaaada7589aa364d080e5b16..91a95192a43ba2cc97bc9cc08b9774e2fc6d0a8a 100644 --- a/mace/ops/ref/depthwise_conv_2d.h +++ b/mace/ops/ref/depthwise_conv_2d.h @@ -30,9 +30,9 @@ namespace ref { template class DepthwiseConv2d { public: - DepthwiseConv2d(const std::vector strides, - const std::vector dilations, - const std::vector paddings, + DepthwiseConv2d(const std::vector &strides, + const std::vector &dilations, + const std::vector &paddings, const Padding padding_type) : strides_(strides), dilations_(dilations), @@ -55,9 +55,9 @@ class DepthwiseConv2d { template<> class DepthwiseConv2d { public: - DepthwiseConv2d(const std::vector strides, - const std::vector dilations, - const std::vector paddings, + DepthwiseConv2d(const std::vector &strides, + const std::vector &dilations, + const std::vector &paddings, const Padding padding_type) : strides_(strides), dilations_(dilations), diff --git a/mace/ops/ref/depthwise_deconv_2d.cc b/mace/ops/ref/depthwise_deconv_2d.cc new file mode 100644 index 0000000000000000000000000000000000000000..0da81faa60b5268d0effb3777669f9419483f77b --- /dev/null +++ b/mace/ops/ref/depthwise_deconv_2d.cc @@ -0,0 +1,307 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
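The ref Deconv2d and DepthwiseDeconv2d kernels above share one pattern: precompute, for every filter tap, its flat offset in the padded output (`index_map`), scatter-accumulate each input element times the filter into the padded output, and finally crop away the pad_top/pad_left border. The single-channel sketch below shows only the scatter-add step; it assumes the caller has zeroed `padded_out` and sized it to at least (in_h - 1) * stride_h + k_h rows by out_w columns, and the function name is illustrative.

// Minimal single-channel sketch of the scatter-add used by the ref deconv
// kernels above. Each input element is multiplied by the whole filter and
// accumulated into the (padded) output at offsets from index_map.
#include <cstdint>
#include <vector>

void Deconv2dSingleChannel(const float *input, int64_t in_h, int64_t in_w,
                           const float *filter, int64_t k_h, int64_t k_w,
                           int64_t stride_h, int64_t stride_w,
                           float *padded_out, int64_t out_w) {
  // Precompute, for each filter tap, its flat offset in the padded output.
  std::vector<int64_t> index_map(k_h * k_w);
  for (int64_t i = 0; i < k_h; ++i) {
    for (int64_t j = 0; j < k_w; ++j) {
      index_map[i * k_w + j] = i * out_w + j;
    }
  }
  for (int64_t i = 0; i < in_h; ++i) {
    for (int64_t j = 0; j < in_w; ++j) {
      const int64_t out_offset = i * stride_h * out_w + j * stride_w;
      const float val = input[i * in_w + j];
      for (int64_t k = 0; k < k_h * k_w; ++k) {
        padded_out[out_offset + index_map[k]] += val * filter[k];
      }
    }
  }
}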
+ +#include +#include +#include +#include "mace/ops/ref/depthwise_deconv_2d.h" +#include "mace/utils/memory.h" + +namespace mace { +namespace ops { +namespace ref { + +MaceStatus DepthwiseDeconv2d::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) { + MACE_UNUSED(context); + + std::vector out_shape; + if (output_shape) { + Tensor::MappingGuard out_shape_guard(output_shape); + MACE_CHECK(output_shape->size() == 4, "output shape should be 4-dims"); + out_shape = + std::vector(output_shape->data(), + output_shape->data() + 4); + } + std::vector padded_out_shape; + std::vector out_pad_size; + CalDeconvOutputShapeAndPadSize(input->shape(), + filter->shape(), + strides_, + padding_type_, + paddings_, + input->dim(1), + &out_shape, + nullptr, + &out_pad_size, + &padded_out_shape, + framework_type_, + NCHW); + + MACE_RETURN_IF_ERROR(output->Resize(out_shape)); + + const bool is_out_padded = + padded_out_shape[2] != out_shape[2] + || padded_out_shape[3] != out_shape[3]; + + std::unique_ptr padded_output(nullptr); + if (is_out_padded) { + index_t padded_out_size = + std::accumulate(padded_out_shape.begin(), + padded_out_shape.end(), + 1, + std::multiplies()) * sizeof(float); + ScratchBuffer *scratch = context->device()->scratch_buffer(); + scratch->Rewind(); + index_t scratch_size = PadAlignSize(padded_out_size); + scratch->GrowSize(scratch_size); + + std::unique_ptr + padded_out + (make_unique(scratch->Scratch(scratch_size), DT_FLOAT)); + padded_out->Reshape(padded_out_shape); + padded_output = std::move(padded_out); + } + Tensor *out_tensor = output; + if (padded_output != nullptr) { + out_tensor = padded_output.get(); + } + + out_tensor->Clear(); + + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard filter_mapper(filter); + Tensor::MappingGuard output_mapper(output); + + auto input_data = input->data(); + auto filter_data = filter->data(); + auto pad_out_data = out_tensor->mutable_data(); + auto out_data = output->mutable_data(); + + auto &in_shape = input->shape(); + + const index_t batch = in_shape[0]; + const index_t channels = in_shape[1]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t pad_out_height = padded_out_shape[2]; + const index_t pad_out_width = padded_out_shape[3]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + const index_t out_img_size = pad_out_height * pad_out_width; + const index_t in_img_size = in_height * in_width; + const index_t kernel_h = filter->dim(2); + const index_t kernel_w = filter->dim(3); + const int kernel_size = static_cast(kernel_h * kernel_w); + const index_t pad_top = out_pad_size[0] / 2; + const index_t pad_left = out_pad_size[1] / 2; + + std::vector index_map(kernel_size, 0); + for (int i = 0; i < kernel_h; ++i) { + for (int j = 0; j < kernel_w; ++j) { + index_map[i * kernel_w + j] = i * pad_out_width + j; + } + } + + for (index_t b = 0; b < batch; ++b) { + for (index_t c = 0; c < channels; ++c) { + float *out_base = + pad_out_data + (b * channels + c) * out_img_size; + for (index_t i = 0; i < in_height; ++i) { + for (index_t j = 0; j < in_width; ++j) { + const index_t out_offset = + i * strides_[0] * pad_out_width + j * strides_[1]; + const index_t input_idx = + (b * channels + c) * in_img_size + i * in_width + j; + const float val = input_data[input_idx]; + const index_t kernel_offset = c * kernel_size; + for (int k = 0; k < kernel_size; ++k) { + const index_t out_idx = 
out_offset + index_map[k]; + const index_t kernel_idx = kernel_offset + k; + out_base[out_idx] += val * filter_data[kernel_idx]; + } + } + } + } + } + + if (out_tensor != output) { + for (index_t i = 0; i < batch; ++i) { + for (index_t j = 0; j < channels; ++j) { + for (index_t k = 0; k < out_height; ++k) { + const float *input_base = + pad_out_data + + ((i * channels + j) * pad_out_height + (k + pad_top)) + * pad_out_width; + float *output_base = + out_data + ((i * channels + j) * out_height + k) * out_width; + memcpy(output_base, input_base + pad_left, out_width * sizeof(float)); + } + } + } + } + return MaceStatus::MACE_SUCCESS; +} + +MaceStatus GroupDeconv2d::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) { + MACE_UNUSED(context); + + std::vector out_shape; + if (output_shape) { + Tensor::MappingGuard out_shape_guard(output_shape); + MACE_CHECK(output_shape->size() == 4, "output shape should be 4-dims"); + out_shape = + std::vector(output_shape->data(), + output_shape->data() + 4); + } + std::vector padded_out_shape; + std::vector out_pad_size; + CalDeconvOutputShapeAndPadSize(input->shape(), + filter->shape(), + strides_, + padding_type_, + paddings_, + group_, + &out_shape, + nullptr, + &out_pad_size, + &padded_out_shape, + framework_type_, + NCHW); + + MACE_RETURN_IF_ERROR(output->Resize(out_shape)); + + const bool is_out_padded = + padded_out_shape[2] != out_shape[2] + || padded_out_shape[3] != out_shape[3]; + + std::unique_ptr padded_output(nullptr); + if (is_out_padded) { + index_t padded_out_size = + std::accumulate(padded_out_shape.begin(), + padded_out_shape.end(), + 1, + std::multiplies()) * sizeof(float); + ScratchBuffer *scratch = context->device()->scratch_buffer(); + scratch->Rewind(); + index_t scratch_size = PadAlignSize(padded_out_size); + scratch->GrowSize(scratch_size); + + std::unique_ptr + padded_out + (make_unique(scratch->Scratch(scratch_size), DT_FLOAT)); + padded_out->Reshape(padded_out_shape); + padded_output = std::move(padded_out); + } + Tensor *out_tensor = output; + if (padded_output != nullptr) { + out_tensor = padded_output.get(); + } + + out_tensor->Clear(); + + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard filter_mapper(filter); + Tensor::MappingGuard output_mapper(output); + + auto input_data = input->data(); + auto filter_data = filter->data(); + auto pad_out_data = out_tensor->mutable_data(); + auto out_data = output->mutable_data(); + + auto &in_shape = input->shape(); + + const index_t batch = in_shape[0]; + const index_t in_channels = in_shape[1]; + const index_t out_channels = out_shape[1]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t pad_out_height = padded_out_shape[2]; + const index_t pad_out_width = padded_out_shape[3]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + const index_t out_img_size = pad_out_height * pad_out_width; + const index_t in_img_size = in_height * in_width; + const index_t kernel_h = filter->dim(2); + const index_t kernel_w = filter->dim(3); + const int kernel_size = static_cast(kernel_h * kernel_w); + const index_t pad_top = out_pad_size[0] / 2; + const index_t pad_left = out_pad_size[1] / 2; + + std::vector index_map(kernel_size, 0); + for (int i = 0; i < kernel_h; ++i) { + for (int j = 0; j < kernel_w; ++j) { + index_map[i * kernel_w + j] = i * out_width + j; + } + } + + const int in_channels_g = in_channels / group_; + const int 
out_channels_g = out_channels / group_; + for (int b = 0; b < in_shape[0]; ++b) { + for (int g = 0; g < group_; ++g) { + for (int p = 0; p < out_channels_g; ++p) { + const index_t out_base = + ((b * group_ + g) * out_channels_g + p) * out_img_size; + for (int i = 0; i < in_height; ++i) { + for (int j = 0; j < in_width; ++j) { + const index_t out_offset = + i * strides_[0] * out_width + j * strides_[1]; + for (int q = 0; q < in_channels_g; ++q) { + const index_t in_base = + ((b * group_ + g) * in_channels_g + q) * in_img_size; + const index_t in_offset = + in_base + i * in_width + j; + const float val = input_data[in_offset]; + const index_t k_offset = + ((p * group_ + g) * in_channels_g + q) * kernel_size; + for (int k = 0; k < kernel_size; ++k) { + const index_t out_idx = out_base + out_offset + index_map[k]; + const float w = filter_data[k_offset + k]; + pad_out_data[out_idx] += val * w; + } + } + } + } + } + } + } + + if (out_tensor != output) { + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < out_channels; ++j) { + for (int k = 0; k < out_height; ++k) { + const float *input_base = + pad_out_data + + ((i * out_channels + j) * pad_out_height + (k + pad_top)) + * pad_out_width; + float *output_base = + out_data + ((i * out_channels + j) * out_height + k) * out_width; + memcpy(output_base, input_base + pad_left, out_width * sizeof(float)); + } + } + } + } + return MaceStatus::MACE_SUCCESS; +} + +} // namespace ref +} // namespace ops +} // namespace mace diff --git a/mace/ops/ref/depthwise_deconv_2d.h b/mace/ops/ref/depthwise_deconv_2d.h new file mode 100644 index 0000000000000000000000000000000000000000..5da7487192a3762e6219716969a826e3f602a85a --- /dev/null +++ b/mace/ops/ref/depthwise_deconv_2d.h @@ -0,0 +1,153 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
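DepthwiseDeconv2d::Compute above is a scatter-style transposed convolution: each input element is multiplied by the whole kernel and accumulated into a padded output at offset (i * stride_h, j * stride_w) through index_map, and the padded buffer is then cropped by (pad_top, pad_left) into the real output. A stripped-down 1-D illustration of the same scatter-and-crop idea (standalone toy code with made-up numbers, not the MACE implementation):

#include <cstdio>
#include <vector>

int main() {
  const std::vector<float> input = {1.f, 2.f, 3.f};
  const std::vector<float> kernel = {1.f, 0.5f};
  const int stride = 2, pad = 1;

  // Padded output length for a 1-D transposed convolution.
  const int padded_len = (static_cast<int>(input.size()) - 1) * stride
      + static_cast<int>(kernel.size());  // 6
  std::vector<float> padded(padded_len, 0.f);

  // Scatter: every input element contributes a full copy of the kernel,
  // accumulated at its strided position (mirrors the out_base[out_idx] += ...).
  for (size_t i = 0; i < input.size(); ++i) {
    for (size_t k = 0; k < kernel.size(); ++k) {
      padded[i * stride + k] += input[i] * kernel[k];
    }
  }

  // Crop the padding away, like the memcpy over pad_top/pad_left above.
  for (int i = pad; i < padded_len - pad; ++i) {
    std::printf("%g ", padded[i]);  // prints: 0.5 2 1 3
  }
  return 0;
}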
+ + +#ifndef MACE_OPS_REF_DEPTHWISE_DECONV_2D_H_ +#define MACE_OPS_REF_DEPTHWISE_DECONV_2D_H_ + +#include + +#include "mace/public/mace.h" +#include "mace/core/tensor.h" +#include "mace/core/op_context.h" +#include "mace/ops/common/conv_pool_2d_util.h" + +namespace mace { +namespace ops { +namespace ref { + +template +class GroupDeconv2d { + public: + GroupDeconv2d(const std::vector &strides, + const std::vector &dilations, + const std::vector &paddings, + const Padding padding_type, + const index_t group, + const FrameworkType framework_type) + : strides_(strides), + dilations_(dilations), + paddings_(paddings), + padding_type_(padding_type), + group_(group), + framework_type_(framework_type) {} + + virtual ~GroupDeconv2d() = default; + + virtual MaceStatus Compute( + const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output); + + private: + const std::vector strides_; + const std::vector dilations_; + const std::vector paddings_; + const Padding padding_type_; + const index_t group_; + const FrameworkType framework_type_; +}; + +template +class DepthwiseDeconv2d : public GroupDeconv2d { + public: + DepthwiseDeconv2d(const std::vector &strides, + const std::vector &dilations, + const std::vector &paddings, + const Padding padding_type, + const FrameworkType framework_type) + : GroupDeconv2d(strides, + dilations, + paddings, + padding_type, + 0, + framework_type) {} + + ~DepthwiseDeconv2d() = default; + + MaceStatus Compute( + const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output); +}; + +template<> +class GroupDeconv2d { + public: + GroupDeconv2d(const std::vector &strides, + const std::vector &dilations, + const std::vector &paddings, + const Padding padding_type, + const index_t group, + const FrameworkType framework_type) + : strides_(strides), + dilations_(dilations), + paddings_(paddings), + padding_type_(padding_type), + group_(group), + framework_type_(framework_type) {} + + virtual ~GroupDeconv2d() = default; + + virtual MaceStatus Compute( + const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output); + + protected: + const std::vector strides_; + const std::vector dilations_; + const std::vector paddings_; + const Padding padding_type_; + const index_t group_; + const FrameworkType framework_type_; +}; + +template<> +class DepthwiseDeconv2d : public GroupDeconv2d { + public: + DepthwiseDeconv2d(const std::vector &strides, + const std::vector &dilations, + const std::vector &paddings, + const Padding padding_type, + const FrameworkType framework_type) + : GroupDeconv2d(strides, + dilations, + paddings, + padding_type, + 0, + framework_type) {} + + ~DepthwiseDeconv2d() = default; + + MaceStatus Compute( + const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output); +}; + +} // namespace ref +} // namespace ops +} // namespace mace + +#endif // MACE_OPS_REF_DEPTHWISE_DECONV_2D_H_ + diff --git a/mace/ops/ref/gemv.cc b/mace/ops/ref/gemv.cc index 59fc31dc3e80f5e63084aa41fc9337b49a4cba86..bf0366f3ce8cab2c848172b511cdfb98d1cb9d27 100644 --- a/mace/ops/ref/gemv.cc +++ b/mace/ops/ref/gemv.cc @@ -16,7 +16,7 @@ #include "mace/ops/ref/gemv.h" #if defined(MACE_ENABLE_QUANTIZE) -#include "mace/utils/quantize.h" +#include "mace/core/quantize.h" #endif // MACE_ENABLE_QUANTIZE namespace mace { diff --git a/mace/ops/resize_bicubic.cc 
b/mace/ops/resize_bicubic.cc index 236e670f1d26b97471e219ba746102d777a008b5..f06692b9711c87e04e710eaaa2c1bce39f44f38f 100644 --- a/mace/ops/resize_bicubic.cc +++ b/mace/ops/resize_bicubic.cc @@ -77,7 +77,8 @@ inline float Interpolate1D(const std::vector &weights, values[2] * weights[2] + values[3] * weights[3]; } -inline void ResizeImage(const float *images, +inline void ResizeImage(const OpContext *context, + const float *images, const index_t batch_size, const index_t in_height, const index_t in_width, @@ -87,47 +88,52 @@ inline void ResizeImage(const float *images, const float height_scale, const float width_scale, float *output) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < batch_size; ++b) { - for (index_t y = 0; y < out_height; ++y) { - std::vector y_weights; - std::vector y_indices; - GetWeightsAndIndices(height_scale, y, in_height, &y_weights, - &y_indices); - for (index_t x = 0; x < out_width; ++x) { - std::vector x_weights; - std::vector x_indices; - GetWeightsAndIndices(width_scale, x, in_width, &x_weights, - &x_indices); - - for (index_t c = 0; c < channels; ++c) { - // Use a 4x4 patch to compute the interpolated output value at - // (b, y, x, c). - const float *channel_input_ptr = - images + (b * channels + c) * in_height * in_width; - float *channel_output_ptr = - output + (b * channels + c) * out_height * out_width; - std::vector coeff(4, 0.0); - for (index_t i = 0; i < 4; ++i) { - const std::vector values = { - channel_input_ptr[y_indices[i] * in_width + x_indices[0]], - channel_input_ptr[y_indices[i] * in_width + x_indices[1]], - channel_input_ptr[y_indices[i] * in_width + x_indices[2]], - channel_input_ptr[y_indices[i] * in_width + x_indices[3]]}; - coeff[i] = Interpolate1D(x_weights, values); + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t y = start1; y < end1; y += step1) { + std::vector y_weights; + std::vector y_indices; + GetWeightsAndIndices(height_scale, y, in_height, &y_weights, + &y_indices); + for (index_t x = 0; x < out_width; ++x) { + std::vector x_weights; + std::vector x_indices; + GetWeightsAndIndices(width_scale, x, in_width, &x_weights, + &x_indices); + + for (index_t c = 0; c < channels; ++c) { + // Use a 4x4 patch to compute the interpolated output value at + // (b, y, x, c). 
+ const float *channel_input_ptr = + images + (b * channels + c) * in_height * in_width; + float *channel_output_ptr = + output + (b * channels + c) * out_height * out_width; + std::vector coeff(4, 0.0); + for (index_t i = 0; i < 4; ++i) { + const std::vector values = { + channel_input_ptr[y_indices[i] * in_width + x_indices[0]], + channel_input_ptr[y_indices[i] * in_width + x_indices[1]], + channel_input_ptr[y_indices[i] * in_width + x_indices[2]], + channel_input_ptr[y_indices[i] * in_width + x_indices[3]]}; + coeff[i] = Interpolate1D(x_weights, values); + } + channel_output_ptr[y * out_width + x] = + Interpolate1D(y_weights, coeff); } - channel_output_ptr[y * out_width + x] = - Interpolate1D(y_weights, coeff); } } } - } + }, 0, batch_size, 1, 0, out_height, 1); } -template +template class ResizeBicubicOp; -template <> +template<> class ResizeBicubicOp : public Operation { public: explicit ResizeBicubicOp(OpConstructContext *context) @@ -175,8 +181,17 @@ class ResizeBicubicOp : public Operation { out_width, align_corners_); - ResizeImage(input_data, batch, in_height, in_width, out_height, out_width, - channels, height_scale, width_scale, output_data); + ResizeImage(context, + input_data, + batch, + in_height, + in_width, + out_height, + out_width, + channels, + height_scale, + width_scale, + output_data); return MaceStatus::MACE_SUCCESS; } diff --git a/mace/ops/resize_bilinear.cc b/mace/ops/resize_bilinear.cc index 46720b3c29d32d01f82902a0bfcc49071aa6aa2a..1fe13f42b2ee20258fb55634746b85f492eea70e 100644 --- a/mace/ops/resize_bilinear.cc +++ b/mace/ops/resize_bilinear.cc @@ -20,7 +20,7 @@ #include "mace/core/operator.h" #include "mace/utils/memory.h" -#include "mace/utils/quantize.h" +#include "mace/core/quantize.h" #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/image/resize_bilinear.h" #endif // MACE_ENABLE_OPENCL @@ -51,7 +51,7 @@ inline void ComputeInterpolationWeights( } } -template +template inline T ComputeLerp(const T top_left, const T top_right, const T bottom_left, @@ -59,7 +59,7 @@ inline T ComputeLerp(const T top_left, const float x_lerp, const float y_lerp); -template <> +template<> inline float ComputeLerp(const float top_left, const float top_right, const float bottom_left, @@ -71,7 +71,7 @@ inline float ComputeLerp(const float top_left, return top + (bottom - top) * y_lerp; } -template <> +template<> inline uint8_t ComputeLerp(const uint8_t top_left, const uint8_t top_right, const uint8_t bottom_left, @@ -83,8 +83,9 @@ inline uint8_t ComputeLerp(const uint8_t top_left, return Saturate(roundf(top + (bottom - top) * y_lerp)); } -template -inline void ResizeImageNCHW(const T *images, +template +inline void ResizeImageNCHW(const OpContext *context, + const T *images, const index_t batch_size, const index_t in_height, const index_t in_width, @@ -96,38 +97,44 @@ inline void ResizeImageNCHW(const T *images, T *output) { const CachedInterpolation *xs = xs_vec.data(); -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < batch_size; ++b) { - for (index_t c = 0; c < channels; ++c) { - const T - *channel_input_ptr = - images + (b * channels + c) * in_height * in_width; - T *channel_output_ptr = - output + (b * channels + c) * out_height * out_width; - for (index_t y = 0; y < out_height; ++y) { - const T *y_lower_input_ptr = - channel_input_ptr + ys[y].lower * in_width; - const T *y_upper_input_ptr = - channel_input_ptr + ys[y].upper * in_width; - const float ys_lerp = ys[y].lerp; - - for (index_t x = 0; x < out_width; ++x) { - const float xs_lerp = 
xs[x].lerp; - const T top_left = y_lower_input_ptr[xs[x].lower]; - const T top_right = y_lower_input_ptr[xs[x].upper]; - const T bottom_left = y_upper_input_ptr[xs[x].lower]; - const T bottom_right = y_upper_input_ptr[xs[x].upper]; - channel_output_ptr[y * out_width + x] = - ComputeLerp(top_left, top_right, bottom_left, - bottom_right, xs_lerp, ys_lerp); + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { + const T + *channel_input_ptr = + images + (b * channels + c) * in_height * in_width; + T *channel_output_ptr = + output + (b * channels + c) * out_height * out_width; + for (index_t y = 0; y < out_height; ++y) { + const T *y_lower_input_ptr = + channel_input_ptr + ys[y].lower * in_width; + const T *y_upper_input_ptr = + channel_input_ptr + ys[y].upper * in_width; + const float ys_lerp = ys[y].lerp; + + for (index_t x = 0; x < out_width; ++x) { + const float xs_lerp = xs[x].lerp; + const T top_left = y_lower_input_ptr[xs[x].lower]; + const T top_right = y_lower_input_ptr[xs[x].upper]; + const T bottom_left = y_upper_input_ptr[xs[x].lower]; + const T bottom_right = y_upper_input_ptr[xs[x].upper]; + channel_output_ptr[y * out_width + x] = + ComputeLerp(top_left, top_right, bottom_left, + bottom_right, xs_lerp, ys_lerp); + } } } } - } + }, 0, batch_size, 1, 0, channels, 1); } -template -inline void ResizeImageNHWC(const T *images, +template +inline void ResizeImageNHWC(const OpContext *context, + const T *images, const index_t batch_size, const index_t in_height, const index_t in_width, @@ -139,39 +146,44 @@ inline void ResizeImageNHWC(const T *images, T *output) { const CachedInterpolation *xs = xs_vec.data(); + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + for (index_t b = 0; b < batch_size; ++b) { const T *input_base = images + b * channels * in_height * in_width; T *output_base = output + b * channels * out_height * out_width; -#pragma omp parallel for schedule(runtime) - for (index_t y = 0; y < out_height; ++y) { - const T - *y_lower_input_ptr = input_base + ys[y].lower * in_width * channels; - const T - *y_upper_input_ptr = input_base + ys[y].upper * in_width * channels; - const float ys_lerp = ys[y].lerp; - - for (index_t x = 0; x < out_width; ++x) { - const float xs_lerp = xs[x].lerp; - const T *top_left = y_lower_input_ptr + xs[x].lower * channels; - const T *top_right = y_lower_input_ptr + xs[x].upper * channels; - const T *bottom_left = y_upper_input_ptr + xs[x].lower * channels; - const T *bottom_right = y_upper_input_ptr + xs[x].upper * channels; - - T *output_ptr = output_base + (y * out_width + x) * channels; - for (index_t c = 0; c < channels; ++c) { - output_ptr[c] = - ComputeLerp(top_left[c], top_right[c], bottom_left[c], - bottom_right[c], xs_lerp, ys_lerp); + + thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { + for (index_t y = start; y < end; y += step) { + const T + *y_lower_input_ptr = input_base + ys[y].lower * in_width * channels; + const T + *y_upper_input_ptr = input_base + ys[y].upper * in_width * channels; + const float ys_lerp = ys[y].lerp; + + for (index_t x = 0; x < out_width; ++x) { + const float xs_lerp = xs[x].lerp; + const T *top_left = y_lower_input_ptr + xs[x].lower * channels; + const T *top_right = y_lower_input_ptr + xs[x].upper * 
channels; + const T *bottom_left = y_upper_input_ptr + xs[x].lower * channels; + const T *bottom_right = y_upper_input_ptr + xs[x].upper * channels; + + T *output_ptr = output_base + (y * out_width + x) * channels; + for (index_t c = 0; c < channels; ++c) { + output_ptr[c] = + ComputeLerp(top_left[c], top_right[c], bottom_left[c], + bottom_right[c], xs_lerp, ys_lerp); + } } } - } + }, 0, out_height, 1); } } -template +template class ResizeBilinearOp; -template +template class ResizeBilinearOp : public Operation { public: explicit ResizeBilinearOp(OpConstructContext *context) @@ -226,7 +238,8 @@ class ResizeBilinearOp : public Operation { ComputeInterpolationWeights(out_height, in_height, height_scale, ys.data()); ComputeInterpolationWeights(out_width, in_width, width_scale, xs.data()); - ResizeImageNCHW(input_data, + ResizeImageNCHW(context, + input_data, batch, in_height, in_width, @@ -301,7 +314,8 @@ class ResizeBilinearOp : public Operation { ComputeInterpolationWeights(out_height, in_height, height_scale, ys.data()); ComputeInterpolationWeights(out_width, in_width, width_scale, xs.data()); - ResizeImageNHWC(input_data, + ResizeImageNHWC(context, + input_data, batch, in_height, in_width, diff --git a/mace/ops/resize_nearest_neighbor.cc b/mace/ops/resize_nearest_neighbor.cc index 5cdbf07fa101881c4b1c5a4b66476a01199cacee..8840458f96f171ae0886b0181163b43c0093b02e 100644 --- a/mace/ops/resize_nearest_neighbor.cc +++ b/mace/ops/resize_nearest_neighbor.cc @@ -26,8 +26,9 @@ namespace mace { namespace ops { -template -inline void ResizeImageNCHW(const T *images, +template +inline void ResizeImageNCHW(const OpContext *context, + const T *images, const index_t batch_size, const index_t in_height, const index_t in_width, @@ -38,36 +39,41 @@ inline void ResizeImageNCHW(const T *images, const float width_scale, bool align_corners, T *output) { -#pragma omp parallel for collapse(2) schedule(runtime) - for (index_t b = 0; b < batch_size; ++b) { - for (index_t c = 0; c < channels; ++c) { - const T - *channel_input_ptr = - images + (b * channels + c) * in_height * in_width; - T *channel_output_ptr = - output + (b * channels + c) * out_height * out_width; - for (index_t y = 0; y < out_height; ++y) { - const index_t in_y = std::min( - (align_corners) ? static_cast(roundf(y * height_scale)) - : static_cast(floorf(y * height_scale)), - in_height - 1); - for (int x = 0; x < out_width; ++x) { - const index_t in_x = std::min( - (align_corners) ? static_cast(roundf(x * width_scale)) - : static_cast(floorf(x * width_scale)), - in_width - 1); - channel_output_ptr[y * out_width + x] = - channel_input_ptr[in_y * in_width + in_x]; + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { + const T + *channel_input_ptr = + images + (b * channels + c) * in_height * in_width; + T *channel_output_ptr = + output + (b * channels + c) * out_height * out_width; + for (index_t y = 0; y < out_height; ++y) { + const index_t in_y = std::min( + (align_corners) ? static_cast(roundf(y * height_scale)) + : static_cast(floorf(y * height_scale)), + in_height - 1); + for (int x = 0; x < out_width; ++x) { + const index_t in_x = std::min( + (align_corners) ? 
static_cast(roundf(x * width_scale)) + : static_cast(floorf(x * width_scale)), + in_width - 1); + channel_output_ptr[y * out_width + x] = + channel_input_ptr[in_y * in_width + in_x]; + } } } } - } + }, 0, batch_size, 1, 0, channels, 1); } -template +template class ResizeNearestNeighborOp; -template +template class ResizeNearestNeighborOp : public Operation { public: explicit ResizeNearestNeighborOp(OpConstructContext *context) @@ -116,7 +122,8 @@ class ResizeNearestNeighborOp : public Operation { resize_nearest_neighbor::CalculateResizeScale(in_width, out_width, align_corners_); - ResizeImageNCHW(input_data, + ResizeImageNCHW(context, + input_data, batch, in_height, in_width, diff --git a/mace/ops/slice.cc b/mace/ops/slice.cc index f990912d0ce1f02ea65ab95d2334cf411aee2750..ac7ca64a9a700412a19a9600afaccdc2e56d81a8 100644 --- a/mace/ops/slice.cc +++ b/mace/ops/slice.cc @@ -66,7 +66,6 @@ class SliceOp : public Operation { const T *input_data = input->data(); T *output_data = output->mutable_data(); -#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < frames; ++i) { const T *input_base = input_data + i * input_dim + offset; diff --git a/mace/ops/softmax.cc b/mace/ops/softmax.cc index 427a29eb850c3a5577c4fd57a5b49e401e255b51..0eda5bf3ccee4973d9d9997ebdaac7fa5293ffa3 100644 --- a/mace/ops/softmax.cc +++ b/mace/ops/softmax.cc @@ -55,6 +55,9 @@ class SoftmaxOp : public Operation { const float *input_data = input->data(); float *output_data = output->mutable_data(); + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + // softmax for nchw image if (input->dim_size() == 4) { const index_t batch = input->dim(0); @@ -63,46 +66,47 @@ class SoftmaxOp : public Operation { const index_t batch_size = class_count * class_size; for (index_t b = 0; b < batch; ++b) { -#pragma omp parallel for schedule(runtime) - for (index_t k = 0; k < class_size; ++k) { - const float *input_ptr = input_data + b * batch_size + k; - float *output_ptr = output_data + b * batch_size + k; - - float max_val = std::numeric_limits::lowest(); - index_t channel_offset = 0; - for (index_t c = 0; c < class_count; ++c) { - float data = input_ptr[channel_offset]; - if (data > max_val) { - max_val = data; - } - channel_offset += class_size; - } - - channel_offset = 0; - float sum = 0; - for (index_t c = 0; c < class_count; ++c) { - float exp_value = ::exp(input_ptr[channel_offset] - max_val); - sum += exp_value; - output_ptr[channel_offset] = exp_value; - channel_offset += class_size; - } + thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { + for (index_t k = start; k < end; k += step) { + const float *input_ptr = input_data + b * batch_size + k; + float *output_ptr = output_data + b * batch_size + k; - sum = std::max(sum, std::numeric_limits::min()); - channel_offset = 0; - if (use_log_) { + float max_val = std::numeric_limits::lowest(); + index_t channel_offset = 0; for (index_t c = 0; c < class_count; ++c) { - output_ptr[channel_offset] /= sum; - output_ptr[channel_offset] = - std::log(output_ptr[channel_offset]); + float data = input_ptr[channel_offset]; + if (data > max_val) { + max_val = data; + } channel_offset += class_size; } - } else { + + channel_offset = 0; + float sum = 0; for (index_t c = 0; c < class_count; ++c) { - output_ptr[channel_offset] /= sum; + float exp_value = ::exp(input_ptr[channel_offset] - max_val); + sum += exp_value; + output_ptr[channel_offset] = exp_value; channel_offset += class_size; } - } - } // k + + sum = std::max(sum, 
std::numeric_limits::min()); + channel_offset = 0; + if (use_log_) { + for (index_t c = 0; c < class_count; ++c) { + output_ptr[channel_offset] /= sum; + output_ptr[channel_offset] = + std::log(output_ptr[channel_offset]); + channel_offset += class_size; + } + } else { + for (index_t c = 0; c < class_count; ++c) { + output_ptr[channel_offset] /= sum; + channel_offset += class_size; + } + } + } // k + }, 0, class_size, 1); } // b } else if (input->dim_size() == 2 || input->dim_size() == 3) { // normal 2d softmax and 3d softmax (dim(0) is batch) @@ -115,35 +119,36 @@ class SoftmaxOp : public Operation { class_size = input->dim(0) * input->dim(1); class_count = input->dim(2); } -#pragma omp parallel for schedule(runtime) - for (index_t k = 0; k < class_size; ++k) { - const float *input_ptr = input_data + k * class_count; - float *output_ptr = output_data + k * class_count; - - float max_val = std::numeric_limits::lowest(); - for (index_t c = 0; c < class_count; ++c) { - max_val = std::max(max_val, input_ptr[c]); - } - - float sum = 0; - for (index_t c = 0; c < class_count; ++c) { - float exp_value = std::exp(input_ptr[c] - max_val); - sum += exp_value; - output_ptr[c] = exp_value; - } + thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { + for (index_t k = start; k < end; k += step) { + const float *input_ptr = input_data + k * class_count; + float *output_ptr = output_data + k * class_count; - sum = std::max(sum, std::numeric_limits::min()); - if (use_log_) { + float max_val = std::numeric_limits::lowest(); for (index_t c = 0; c < class_count; ++c) { - output_ptr[c] /= sum; - output_ptr[c] = std::log(output_ptr[c]); + max_val = std::max(max_val, input_ptr[c]); } - } else { + + float sum = 0; for (index_t c = 0; c < class_count; ++c) { - output_ptr[c] /= sum; + float exp_value = std::exp(input_ptr[c] - max_val); + sum += exp_value; + output_ptr[c] = exp_value; + } + + sum = std::max(sum, std::numeric_limits::min()); + if (use_log_) { + for (index_t c = 0; c < class_count; ++c) { + output_ptr[c] /= sum; + output_ptr[c] = std::log(output_ptr[c]); + } + } else { + for (index_t c = 0; c < class_count; ++c) { + output_ptr[c] /= sum; + } } } - } + }, 0, class_size, 1); } else { MACE_NOT_IMPLEMENTED; } @@ -202,30 +207,35 @@ class SoftmaxOp : public Operation { float input_scale = input->scale(); uint8_t *output_data = output->mutable_data(); + utils::ThreadPool + &thread_pool = context->device()->cpu_runtime()->thread_pool(); + // If depth is short, do it using float32. Float computation should not // be here, but as long as it is on CPU, it is fine. 
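All of the removed "#pragma omp parallel for" loops in this file follow the same translation: the loop body moves into a lambda that receives a half-open sub-range [start, end) plus a step, and the thread pool decides how the full iteration space is split across workers. The stand-in below is a serial toy Compute1D (not the real mace::utils::ThreadPool) that shows only the calling contract assumed by the rewritten softmax code:

#include <cstdint>
#include <cstdio>
#include <functional>

using index_t = int64_t;

// Toy stand-in: runs the whole range on the calling thread. The real pool
// splits [start, end) into tiles and hands each tile to a worker through the
// same (start, end, step) signature.
void Compute1D(const std::function<void(index_t, index_t, index_t)> &func,
               index_t start, index_t end, index_t step) {
  func(start, end, step);
}

int main() {
  float data[8] = {3, 1, 4, 1, 5, 9, 2, 6};
  // Old form:  #pragma omp parallel for
  //            for (index_t i = 0; i < 8; ++i) data[i] *= 2.f;
  // New form: the body only iterates over the sub-range it was given.
  Compute1D([&](index_t start, index_t end, index_t step) {
    for (index_t i = start; i < end; i += step) {
      data[i] *= 2.f;
    }
  }, 0, 8, 1);
  for (float v : data) std::printf("%g ", v);  // 6 2 8 2 10 18 4 12
  return 0;
}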
if (depth < 32) { -#pragma omp parallel for schedule(runtime) - for (index_t b = 0; b < batch; ++b) { - const uint8_t *input_ptr = input_data + b * depth; - uint8_t *output_ptr = output_data + b * depth; + thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { + for (index_t b = start; b < end; b += step) { + const uint8_t *input_ptr = input_data + b * depth; + uint8_t *output_ptr = output_data + b * depth; - uint8_t max_value = FindMax(input_ptr, depth); - float sum = 0; - std::vector depth_cache(depth); - for (index_t d = 0; d < depth; ++d) { - float exp_value = ::exp((static_cast(input_ptr[d]) - max_value) - * input_scale); - sum += exp_value; - depth_cache[d] = exp_value; - } + uint8_t max_value = FindMax(input_ptr, depth); + float sum = 0; + std::vector depth_cache(depth); + for (index_t d = 0; d < depth; ++d) { + float exp_value = ::exp((static_cast(input_ptr[d]) - max_value) + * input_scale); + sum += exp_value; + depth_cache[d] = exp_value; + } - sum = std::max(sum, std::numeric_limits::min()); - for (index_t d = 0; d < depth; ++d) { - double output_f = depth_cache[d] / sum; - output_ptr[d] = static_cast(output_f * 255); + sum = std::max(sum, std::numeric_limits::min()); + for (index_t d = 0; d < depth; ++d) { + double output_f = depth_cache[d] / sum; + output_ptr[d] = static_cast(output_f * 255); + } } - } + }, 0, batch, 1); + return MaceStatus::MACE_SUCCESS; } @@ -234,19 +244,19 @@ class SoftmaxOp : public Operation { (1ll << 31) - 1.0)); int32_t input_delta_limit = -((1ll << 31) - 1) / scale_q; -#pragma omp parallel for schedule(runtime) - for (index_t b = 0; b < batch; ++b) { - const uint8_t *input_ptr = input_data + b * depth; - uint8_t *output_ptr = output_data + b * depth; + thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { + for (index_t b = start; b < end; b += step) { + const uint8_t *input_ptr = input_data + b * depth; + uint8_t *output_ptr = output_data + b * depth; - FixPointSumExp sum = FixPointSumExp::Zero(); - uint8_t max_value = FindMax(input_ptr, depth); - index_t d = 0; + FixPointSumExp sum = FixPointSumExp::Zero(); + uint8_t max_value = FindMax(input_ptr, depth); + index_t d = 0; - // Neon optimization is not useful so far as we benchmark. - // Enable it when we find a case that proves it useful. + // Neon optimization is not useful so far as we benchmark. + // Enable it when we find a case that proves it useful. 
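For larger depths the quantized softmax below stays in 32-bit fixed point: exp(scale * delta) is evaluated with gemmlowp's fixed-point helpers and the normalized result is rescaled to the 0..255 output range. As a generic illustration of the underlying Q-format arithmetic (plain integers here, not the gemmlowp types used in this file):

#include <cstdint>
#include <cstdio>

// Q0.31-style fixed point: value = raw / 2^31, usable for numbers in [-1, 1).
int32_t ToQ31(double x) { return static_cast<int32_t>(x * (1ll << 31)); }
double FromQ31(int32_t raw) { return static_cast<double>(raw) / (1ll << 31); }

int32_t MulQ31(int32_t a, int32_t b) {
  // A 64-bit intermediate keeps the full product before shifting back down.
  return static_cast<int32_t>((static_cast<int64_t>(a) * b) >> 31);
}

int main() {
  int32_t a = ToQ31(0.25), b = ToQ31(0.5);
  std::printf("%f\n", FromQ31(MulQ31(a, b)));  // ~0.125
  return 0;
}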
#if 0 && defined(MACE_ENABLE_NEON) - using FixPointInputDeltaInt32x4 = gemmlowp::FixedPoint; using FixPointSumExpInt32x4 = gemmlowp::FixedPoint; @@ -305,33 +315,33 @@ class SoftmaxOp : public Operation { vpadd_s32(vsum_reduced_2_s32, vsum_reduced_2_s32); sum = FixPointSumExp::FromRaw(vget_lane_s32(vsum_reduced_1_s32, 0)); #endif - for (; d < depth; ++d) { - int32_t input_delta = static_cast(input_ptr[d]) - max_value; - if (input_delta >= input_delta_limit) { - int32_t scaled_input_delta_q = scale_q * input_delta; - FixPointInputDelta scaled_input_delta_fp = - FixPointInputDelta::FromRaw(scaled_input_delta_q); - sum = sum + gemmlowp::Rescale( - exp_on_negative_values(scaled_input_delta_fp)); + for (; d < depth; ++d) { + int32_t input_delta = static_cast(input_ptr[d]) - max_value; + if (input_delta >= input_delta_limit) { + int32_t scaled_input_delta_q = scale_q * input_delta; + FixPointInputDelta scaled_input_delta_fp = + FixPointInputDelta::FromRaw(scaled_input_delta_q); + sum = sum + gemmlowp::Rescale( + exp_on_negative_values(scaled_input_delta_fp)); + } } - } - int32_t sum_q = sum.raw(); - int left_zero_count = - __builtin_clz(static_cast(sum_q)); - int tail_count = kSumExpIntBits - left_zero_count; - int32_t fractional_q0 = static_cast( - (static_cast(sum_q) << left_zero_count) - - (static_cast(1) << 31)); - FixPoint0 recip_sum_q0 = gemmlowp::one_over_one_plus_x_for_x_in_0_1( - FixPoint0::FromRaw(fractional_q0)); + int32_t sum_q = sum.raw(); + int left_zero_count = + __builtin_clz(static_cast(sum_q)); + int tail_count = kSumExpIntBits - left_zero_count; + int32_t fractional_q0 = static_cast( + (static_cast(sum_q) << left_zero_count) - + (static_cast(1) << 31)); + FixPoint0 recip_sum_q0 = gemmlowp::one_over_one_plus_x_for_x_in_0_1( + FixPoint0::FromRaw(fractional_q0)); - d = 0; + d = 0; - // Neon optimization is not useful so far as we benchmark. - // Enable it when we find a case that proves it useful. + // Neon optimization is not useful so far as we benchmark. + // Enable it when we find a case that proves it useful. 
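Before taking the reciprocal of the accumulated sum, the code above normalizes sum_q with __builtin_clz: shifting left by the leading-zero count puts the most significant set bit at position 31, so the value lands in [2^31, 2^32) and its fractional part can be handed to the 1/(1+x) helper. A small standalone check of that normalization step (GCC/Clang builtin, illustrative value):

#include <cstdint>
#include <cstdio>

int main() {
  uint32_t sum_q = 300000;                     // some positive accumulator
  int left_zero_count = __builtin_clz(sum_q);  // leading zero bits, here 13
  // Shifting by that count scales sum_q into [2^31, 2^32), i.e. the
  // "1 <= x < 2" form whose reciprocal the fixed-point helper expects.
  uint64_t normalized = static_cast<uint64_t>(sum_q) << left_zero_count;
  std::printf("%d %llu\n", left_zero_count,
              static_cast<unsigned long long>(normalized));
  return 0;
}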
#if 0 && defined(MACE_ENABLE_NEON) - FixPoint0Int32x4 vrecip_sum_q0_s32_fp = + FixPoint0Int32x4 vrecip_sum_q0_s32_fp = FixPoint0Int32x4::FromScalarRaw(recip_sum_q0.raw()); int16x8_t vinput_delta_limit_s16 = vdupq_n_s16(input_delta_limit); for (; d <= depth - 8; d += 8) { @@ -371,21 +381,23 @@ class SoftmaxOp : public Operation { vst1_u8(output_ptr + d, voutput_u8); } #endif - for (; d < depth; ++d) { - int32_t input_delta = static_cast(input_ptr[d]) - max_value; - if (input_delta >= input_delta_limit) { - int32_t scaled_input_delta_q = scale_q * input_delta; - FixPointInputDelta scaled_input_delta_fp = - FixPointInputDelta::FromRaw(scaled_input_delta_q); - - FixPoint0 exp = exp_on_negative_values(scaled_input_delta_fp); - int32_t output_data = gemmlowp::RoundingDivideByPOT( - (recip_sum_q0 * exp).raw(), tail_count + 31 - 8); - - output_ptr[d] = std::max(std::min(output_data, 255), 0); + for (; d < depth; ++d) { + int32_t input_delta = static_cast(input_ptr[d]) - max_value; + if (input_delta >= input_delta_limit) { + int32_t scaled_input_delta_q = scale_q * input_delta; + FixPointInputDelta scaled_input_delta_fp = + FixPointInputDelta::FromRaw(scaled_input_delta_q); + + FixPoint0 exp = exp_on_negative_values(scaled_input_delta_fp); + int32_t output_data = gemmlowp::RoundingDivideByPOT( + (recip_sum_q0 * exp).raw(), tail_count + 31 - 8); + + output_ptr[d] = std::max(std::min(output_data, 255), 0); + } } } - } + }, 0, batch, 1); + return MaceStatus::MACE_SUCCESS; } diff --git a/mace/ops/space_to_batch.cc b/mace/ops/space_to_batch.cc index ece9b6f61dd25e0fe4c6d2f5aff1aeea4ed55302..b239193c2641af400fb5c67f25be2efff8c04859 100644 --- a/mace/ops/space_to_batch.cc +++ b/mace/ops/space_to_batch.cc @@ -130,7 +130,6 @@ class SpaceToBatchNDOp : public SpaceToBatchOpBase { std::max(static_cast(1), 8 * 1024 / block_shape_w / in_width); // make channel outter loop so we can make best use of cache -#pragma omp parallel for collapse(3) schedule(runtime) for (index_t c = 0; c < channels; ++c) { for (index_t block_h = 0; block_h < out_height; block_h += block_h_size) { @@ -239,7 +238,6 @@ class SpaceToBatchNDOp : public SpaceToBatchOpBase { index_t out_width = batch_tensor->dim(2); index_t channels = batch_tensor->dim(3); -#pragma omp parallel for schedule(runtime) for (index_t b = 0; b < out_batches; ++b) { const index_t in_b = b % in_batches; const index_t tile_index = b / in_batches; diff --git a/mace/ops/space_to_depth.cc b/mace/ops/space_to_depth.cc index 4e40227c5b5857d065195d509bcafe55fbef1c59..918ae678b5cb09c2f6c8f2a584f3b5fbb5d47997 100644 --- a/mace/ops/space_to_depth.cc +++ b/mace/ops/space_to_depth.cc @@ -58,7 +58,6 @@ class SpaceToDepthOp : public Operation { const T *input_ptr = input->data(); T *output_ptr = output->mutable_data(); -#pragma omp parallel for for (index_t b = 0; b < batch_size; ++b) { for (index_t d = 0; d < input_depth; ++d) { for (index_t h = 0; h < input_height; ++h) { diff --git a/mace/ops/splice.cc b/mace/ops/splice.cc index 8517b6b831c80397086e0598f8803aeff0be81ce..6d47732904820f6a83712cb1cb309e424b459615 100644 --- a/mace/ops/splice.cc +++ b/mace/ops/splice.cc @@ -32,10 +32,10 @@ namespace mace { namespace ops { -template +template class SpliceOp; -template +template class SpliceOp : public Operation { public: explicit SpliceOp(OpConstructContext *context) @@ -85,7 +85,6 @@ class SpliceOp : public Operation { const T *input_data = input->data(); T *output_data = output->mutable_data(); -#pragma omp parallel for collapse(3) schedule(runtime) for (int b = 0; b < batch; ++b) 
{ for (index_t i = 0; i < out_chunk; ++i) { for (index_t c = 0; c < num_splice; ++c) { @@ -102,7 +101,6 @@ class SpliceOp : public Operation { if (const_dim_ > 0) { const index_t output_offset = output_dim - const_dim_; const index_t input_offset = dim; -#pragma omp parallel for collapse(2) schedule(runtime) for (int b = 0; b < batch; ++b) { for (index_t i = 0; i < out_chunk; ++i) { T *output_base = output_data + + b * output_stride + i * output_dim; diff --git a/mace/ops/split.cc b/mace/ops/split.cc index 7c920d4c115f9650973ab62a2c79d29b677faf83..e1523a06253c2a38c2451046e4daa1b0c51d2713 100644 --- a/mace/ops/split.cc +++ b/mace/ops/split.cc @@ -24,10 +24,10 @@ namespace mace { namespace ops { -template +template class SplitOp; -template +template class SplitOp : public Operation { public: explicit SplitOp(OpConstructContext *context) @@ -70,19 +70,18 @@ class SplitOp : public Operation { output_shape.end(), 1, std::multiplies()); - for (size_t i= 0; i < outputs_count; ++i) { + for (size_t i = 0; i < outputs_count; ++i) { MACE_RETURN_IF_ERROR(output_list[i]->Resize(output_shape)); output_ptrs[i] = output_list[i]->mutable_data(); } const T *input_ptr = input->data(); -#pragma omp parallel for for (int outer_idx = 0; outer_idx < outer_size; ++outer_idx) { index_t input_idx = outer_idx * input_channels * inner_size; index_t output_idx = outer_idx * output_channels * inner_size; for (size_t i = 0; i < outputs_count; ++i) { if (DataTypeCanUseMemcpy(DataTypeToEnum::v())) { - memcpy(output_ptrs[i]+output_idx, input_ptr+input_idx, + memcpy(output_ptrs[i] + output_idx, input_ptr + input_idx, output_channels * inner_size * sizeof(T)); } else { for (index_t k = 0; k < output_channels * inner_size; ++k) { @@ -100,7 +99,6 @@ class SplitOp : public Operation { bool checked_; }; - #ifdef MACE_ENABLE_OPENCL template class SplitOp : public Operation { @@ -130,7 +128,6 @@ class SplitOp : public Operation { }; #endif // MACE_ENABLE_OPENCL - void RegisterSplit(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "Split", SplitOp, DeviceType::CPU, float); @@ -150,15 +147,15 @@ void RegisterSplit(OpRegistryBase *op_registry) { [](OpConstructContext *context) -> std::set { auto op = context->operator_def(); if (op->output_shape_size() != op->output_size()) { - return { DeviceType::CPU, DeviceType::GPU }; + return {DeviceType::CPU, DeviceType::GPU}; } int axis = ProtoArgHelper::GetOptionalArg( *op, "axis", 3); if (axis != 3 || op->output_shape(0).dims_size() != 4 || (op->output_shape(0).dims()[3] % 4 != 0)) { - return { DeviceType::CPU }; + return {DeviceType::CPU}; } - return { DeviceType::CPU, DeviceType::GPU }; + return {DeviceType::CPU, DeviceType::GPU}; })); } diff --git a/mace/ops/sqrdiff_mean.cc b/mace/ops/sqrdiff_mean.cc index b937b259322615abcbb929e4c17c0f41e3844167..d58191c4d0bd6b2d992af9495c56b1a7dca4bc44 100644 --- a/mace/ops/sqrdiff_mean.cc +++ b/mace/ops/sqrdiff_mean.cc @@ -65,7 +65,7 @@ class SqrDiffMeanOp : public Operation { const index_t img_size = input0->dim(2) * input0->dim(3); const index_t bc = input0->dim(0) * input0->dim(1); -#pragma omp parallel for schedule(runtime) + for (int i = 0; i < bc; ++i) { for (int j = 0; j < img_size; ++j) { T diff = input_ptr0[i * img_size + j] - input_ptr1[i]; diff --git a/mace/ops/sum_group.cc b/mace/ops/sum_group.cc index 21c83b68f98b791a9a061fb1226b6b86edfceba6..0efdfe2a764d81ab35d035bf93f7ffeeb6e66174 100644 --- a/mace/ops/sum_group.cc +++ b/mace/ops/sum_group.cc @@ -81,7 +81,6 @@ class SumGroupOp : public Operation { << "size value over-ranged:" 
<< cur_index << "<=" << input_dim; } -#pragma omp parallel for collapse(2) schedule(runtime) for (index_t i = 0; i < bh; ++i) { for (index_t j = 0; j < output_dim; ++j) { int start_col = sum_indexes[j].first; diff --git a/mace/ops/target_rms_norm.cc b/mace/ops/target_rms_norm.cc index 7b769fe712c35cc39cf282731f2a5d64d21d8695..80d42a1d0579fe563ea3fec01655de98e610dd08 100644 --- a/mace/ops/target_rms_norm.cc +++ b/mace/ops/target_rms_norm.cc @@ -91,7 +91,6 @@ class TargetRMSNormOp : public Operation { const float *input_data = input->data(); float *output_data = output->mutable_data(); -#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < bh; ++i) { float scale = SquareSum(input_data + i * dim, dim); scale = static_cast(1.0 / std::sqrt(scale / d_scale)); diff --git a/mace/ops/thread_pool_benchmark.cc b/mace/ops/thread_pool_benchmark.cc index 1fd14713d87157a13d98e30fe1a845fe189d8078..f800929809725ac73cd63fb082831e0dd7a38dd5 100644 --- a/mace/ops/thread_pool_benchmark.cc +++ b/mace/ops/thread_pool_benchmark.cc @@ -29,16 +29,16 @@ namespace test { namespace { -const size_t kMaxSize = 100000000; -const size_t image_size = 56 * 56; -std::vector output_data(kMaxSize), bias_data(kMaxSize); +const index_t kMaxSize = 100000000; +const index_t image_size = 56 * 56; +std::vector output_data(kMaxSize), bias_data(kMaxSize); void OpenMPBenchmark1D(int iters, int size) { while (iters--) { const int b = 0; #pragma omp parallel for schedule(runtime) for (int c = 0; c < size; ++c) { - for (size_t i = 0; i < image_size; ++i) { + for (index_t i = 0; i < image_size; ++i) { output_data[(b * size + c) * image_size + i] += bias_data[c]; } } @@ -52,11 +52,10 @@ void ThreadPoolBenchmark1D(int iters, int size) { mace::testing::StartTiming(); while (iters--) { - const int b = 0; // 'const' keyword affects performance - int batch_size = size * image_size; - thread_pool.Compute1D([&](size_t start0, size_t end0, size_t step0) { - for (size_t c = start0; c < end0; c += step0) { - for (size_t i = 0; i < image_size; ++i) { + const int b = 0; + thread_pool.Compute1D([=](index_t start0, index_t end0, index_t step0) { + for (index_t c = start0; c < end0; c += step0) { + for (index_t i = 0; i < image_size; ++i) { output_data[(b * size + c) * image_size + i] += bias_data[c]; } } @@ -67,14 +66,13 @@ void ThreadPoolBenchmark1D(int iters, int size) { void OpenMPBenchmark2D(int iters, int size0, int size1) { while (iters--) { #pragma omp parallel for collapse(2) schedule(runtime) - for (int b = 0; b < size0; ++b) { - for (int c = 0; c < size1; ++c) { - for (size_t i = 0; i < image_size; ++i) { - // it seems like OpenMP optimize the following mac - output_data[(b * size1 + c) * image_size + i] += bias_data[c]; - } + for (int b = 0; b < size0; ++b) { + for (int c = 0; c < size1; ++c) { + for (index_t i = 0; i < image_size; ++i) { + output_data[(b * size1 + c) * image_size + i] += bias_data[c]; } } + } } } @@ -85,11 +83,11 @@ void ThreadPoolBenchmark2D(int iters, int size0, int size1) { mace::testing::StartTiming(); while (iters--) { - thread_pool.Compute2D([&](size_t start0, size_t end0, size_t step0, - size_t start1, size_t end1, size_t step1) { - for (size_t b = start0; b < end0; b += step0) { - for (size_t c = start1; c < end1; c += step1) { - for (size_t i = 0; i < image_size; ++i) { + thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { + for (index_t 
i = 0; i < image_size; ++i) { output_data[(b * size1 + c) * image_size + i] += bias_data[c]; } } diff --git a/mace/ops/transpose.cc b/mace/ops/transpose.cc index 678f3ee642f210083904c189fc4752dcd8c5bd4e..6c6993e065a9dbf1f0a0bf0e336ea32598a9989b 100644 --- a/mace/ops/transpose.cc +++ b/mace/ops/transpose.cc @@ -26,10 +26,10 @@ namespace mace { namespace ops { -template +template class TransposeOp; -template +template class TransposeOp : public Operation { public: explicit TransposeOp(OpConstructContext *context) @@ -55,7 +55,8 @@ class TransposeOp : public Operation { const float *input_data = input->data(); float *output_data = output->mutable_data(); - return Transpose(input_data, input->shape(), dims_, output_data); + return Transpose(&context->device()->cpu_runtime()->thread_pool(), + input_data, input->shape(), dims_, output_data); } private: diff --git a/mace/public/mace.h b/mace/public/mace.h index 8cc251132d9d2ee26ecf70b2684e7eee25f50f15..fd39fdba6c501b6f1aa4eb6cb7980fa5158012ca 100644 --- a/mace/public/mace.h +++ b/mace/public/mace.h @@ -299,11 +299,9 @@ class MACE_API MaceEngineConfig { /// \param status MACE_SUCCESS for successful, or it can't reliabley /// detect big-LITTLE cores (see GetBigLittleCoreIDs). In such cases, it's /// suggested to use AFFINITY_NONE to use all cores. - /// \param use_gemmlowp use gemmlowp for cpu quantized inference /// \return MaceStatus::MACE_SUCCESS for success, other for failed. MaceStatus SetCPUThreadPolicy(int num_threads_hint, - CPUAffinityPolicy policy, - bool use_gemmlowp = false); + CPUAffinityPolicy policy); private: class Impl; diff --git a/mace/test/mace_api_test.h b/mace/test/mace_api_test.h index 2257b2162ca6d53e81fd29367594bf860ff115ec..9cc1402f7558c9e5d0d1116eaef2fb161adda194 100644 --- a/mace/test/mace_api_test.h +++ b/mace/test/mace_api_test.h @@ -163,7 +163,7 @@ void CheckOutputs(const NetDef &net_def, std::unique_ptr tmp_tensor( new Tensor(allocator.get(), DataTypeToEnum::v())); - auto output_shape = output.second.shape(); + auto &output_shape = output.second.shape(); const int64_t data_size = std::accumulate(output_shape.begin(), output_shape.end(), 1, std::multiplies()); diff --git a/mace/tools/validation/mace_run.cc b/mace/tools/validation/mace_run.cc index cc77b6b303a42a32ae9aab0d1ee9033c82dcba62..7fc0690df25c3f2dc094cc4f36109b3eba392e23 100644 --- a/mace/tools/validation/mace_run.cc +++ b/mace/tools/validation/mace_run.cc @@ -167,8 +167,7 @@ bool RunModel(const std::string &model_name, MaceEngineConfig config(device_type); status = config.SetCPUThreadPolicy( FLAGS_omp_num_threads, - static_cast(FLAGS_cpu_affinity_policy), - true); + static_cast(FLAGS_cpu_affinity_policy)); if (status != MaceStatus::MACE_SUCCESS) { LOG(WARNING) << "Set openmp or cpu affinity failed."; } diff --git a/mace/utils/quantize.h b/mace/utils/quantize.h deleted file mode 100644 index 30595046cabffc6d33a57803dcf59d638962a6d4..0000000000000000000000000000000000000000 --- a/mace/utils/quantize.h +++ /dev/null @@ -1,301 +0,0 @@ -// Copyright 2018 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_UTILS_QUANTIZE_H_ -#define MACE_UTILS_QUANTIZE_H_ - -#include -#include -#include - -#if defined(MACE_ENABLE_NEON) -#include -#endif // MACE_ENABLE_NEON - -#include "mace/utils/logging.h" - -namespace mace { - -template -inline void AdjustRange(const float in_min_data, - const float in_max_data, - const bool non_zero, - float *scale, - int32_t *zero_point) { - // re-range to make range include zero float and - // make zero float as integer u8 - const T quantized_min = std::numeric_limits::lowest(); - const T quantized_max = std::numeric_limits::max(); - if (quantized_min < 0) { - MACE_ASSERT(!non_zero, "Cannot nudge to non_zero quantize value."); - } - - float out_max = std::max(0.f, in_max_data); - float out_min = std::min(0.f, in_min_data); - // make in_min_data quantize as greater than 1 - if (non_zero) { - out_min = std::min(out_min, - in_min_data - (out_max - in_min_data) - / (quantized_max - quantized_min - 1)); - } - - *scale = (out_max - out_min) / (quantized_max - quantized_min); - const float kEps = 1e-6; - if (out_min < -kEps && out_max > kEps) { - float quantized_zero = -out_min / *scale; - int32_t - quantized_zero_near_int = static_cast(roundf(quantized_zero)); - *zero_point = quantized_zero_near_int; - if (fabs(quantized_zero - quantized_zero_near_int) > kEps && non_zero) { - *zero_point = static_cast(std::ceil(quantized_zero)); - } - } else if (out_min > -kEps) { - *zero_point = quantized_min; - } else { - *zero_point = quantized_max; - } -} - -template -inline T Saturate(float value) { - int rounded_value = static_cast(value); - if (rounded_value <= std::numeric_limits::lowest()) { - return std::numeric_limits::lowest(); - } else if (rounded_value >= std::numeric_limits::max()) { - return std::numeric_limits::max(); - } else { - return static_cast(rounded_value); - } -} - -inline void FindMinMax(const float *input, - const index_t size, - float *min_val, float *max_val) { - float max_v = std::numeric_limits::lowest(); - float min_v = std::numeric_limits::max(); - for (index_t i = 0; i < size; ++i) { - max_v = std::max(max_v, input[i]); - min_v = std::min(min_v, input[i]); - } - *min_val = min_v; - *max_val = max_v; -} - -template -inline void QuantizeWithScaleAndZeropoint(const float *input, - const index_t size, - float scale, - int32_t zero_point, - T *output) { - float recip_scale = 1 / scale; -#pragma omp parallel for schedule(runtime) - for (int i = 0; i < size; ++i) { - output[i] = Saturate(roundf(zero_point + recip_scale * input[i])); - } -} - -template -inline void Quantize(const float *input, - const index_t size, - bool non_zero, - T *output, - float *scale, - int32_t *zero_point) { - float in_min_data; - float in_max_data; - FindMinMax(input, size, &in_min_data, &in_max_data); - - AdjustRange(in_min_data, in_max_data, non_zero, - scale, zero_point); - - QuantizeWithScaleAndZeropoint(input, size, *scale, *zero_point, output); -} - -template -inline void Quantize(const Tensor &input, - Tensor *output, - float *min_out, - float *max_out) { - MACE_CHECK(input.size() != 0); - Tensor::MappingGuard input_guard(&input); - Tensor::MappingGuard output_guard(output); - auto *input_data = input.data(); - auto *output_data = output->mutable_data(); - float scale; - int32_t zero_point; - - Quantize(input_data, input.size(), false, output_data, &scale, &zero_point); - - *min_out = scale * (std::numeric_limits::lowest() - zero_point); - *max_out = scale * 
(std::numeric_limits::max() - zero_point); -} - -template -inline void Dequantize(const T *input, - const index_t size, - const float scale, - const int32_t zero_point, - float *output) { -#pragma omp parallel for schedule(runtime) - for (int i = 0; i < size; ++i) { - output[i] = scale * (input[i] - zero_point); - } -} - -#if defined(MACE_ENABLE_NEON) -template<> -inline void QuantizeWithScaleAndZeropoint(const float *input, - const index_t size, - float scale, - int32_t zero_point, - uint8_t *output) { - const float32x4_t vround = vdupq_n_f32(0.5); - const float32x4_t - vzero = vaddq_f32(vround, vcvtq_f32_s32(vdupq_n_s32(zero_point))); - const float recip_scale = 1.f / scale; - const float32x4_t vrecip_scale = vdupq_n_f32(recip_scale); - const index_t block_count = size / 16; - -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < block_count; ++i) { - float32x4_t vi0 = vld1q_f32(input + i * 16); - float32x4_t vi1 = vld1q_f32(input + i * 16 + 4); - float32x4_t vi2 = vld1q_f32(input + i * 16 + 8); - float32x4_t vi3 = vld1q_f32(input + i * 16 + 12); - - int32x4_t vo0_s32 = vcvtq_s32_f32(vmlaq_f32(vzero, vi0, vrecip_scale)); - int32x4_t vo1_s32 = vcvtq_s32_f32(vmlaq_f32(vzero, vi1, vrecip_scale)); - int32x4_t vo2_s32 = vcvtq_s32_f32(vmlaq_f32(vzero, vi2, vrecip_scale)); - int32x4_t vo3_s32 = vcvtq_s32_f32(vmlaq_f32(vzero, vi3, vrecip_scale)); - - uint8x8_t vo0_u8 = - vqmovun_s16(vcombine_s16(vqmovn_s32(vo0_s32), vqmovn_s32(vo1_s32))); - uint8x8_t vo1_u8 = - vqmovun_s16(vcombine_s16(vqmovn_s32(vo2_s32), vqmovn_s32(vo3_s32))); - uint8x16_t vo = vcombine_u8(vo0_u8, vo1_u8); - - vst1q_u8(output + i * 16, vo); - } - -#pragma omp parallel for schedule(runtime) - for (index_t i = block_count * 16; i < size; ++i) { - output[i] = Saturate(roundf(zero_point + recip_scale * input[i])); - } -} - -template<> -inline void Dequantize(const int32_t *input, - const index_t size, - const float scale, - const int32_t zero_point, - float *output) { - const index_t block_count = size / 4; - const int32x4_t vzero = vdupq_n_s32(zero_point); - const float32x4_t vscale = vdupq_n_f32(scale); - -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < block_count; ++i) { - int32x4_t vi = vld1q_s32(input + i * 4); - float32x4_t vo = vmulq_f32(vscale, vcvtq_f32_s32(vsubq_s32(vi, vzero))); - vst1q_f32(output + i * 4, vo); - } - for (index_t i = block_count * 4; i < size; ++i) { - output[i] = scale * (input[i] - zero_point); - } -} - -template<> -inline void Dequantize(const uint8_t *input, - const index_t size, - const float scale, - const int32_t zero_point, - float *output) { - const index_t block_count = size / 16; - const int32x4_t vzero = vdupq_n_s32(zero_point); - const float32x4_t vscale = vdupq_n_f32(scale); - -#pragma omp parallel for schedule(runtime) - for (index_t i = 0; i < block_count; ++i) { - uint8x16_t vi = vld1q_u8(input + i * 16); - float32x4x4_t vo = { - vmulq_f32(vscale, - vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16( - vget_low_u16(vmovl_u8(vget_low_u8(vi))))), vzero))), - vmulq_f32(vscale, - vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16( - vget_high_u16(vmovl_u8(vget_low_u8(vi))))), vzero))), - vmulq_f32(vscale, - vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16( - vget_low_u16(vmovl_u8(vget_high_u8(vi))))), vzero))), - vmulq_f32(vscale, - vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16( - vget_high_u16(vmovl_u8(vget_high_u8(vi))))), vzero))), - }; - vst1q_f32(output + i * 16, vo.val[0]); - vst1q_f32(output + i * 16 + 4, vo.val[1]); - 
vst1q_f32(output + i * 16 + 8, vo.val[2]); - vst1q_f32(output + i * 16 + 12, vo.val[3]); - } - for (index_t i = block_count * 16; i < size; ++i) { - output[i] = scale * (input[i] - zero_point); - } -} -#endif // MACE_ENABLE_NEON - -template -inline void DeQuantize(const Tensor &input, - const float min_in, - const float max_in, - Tensor *output) { - MACE_CHECK(input.size() != 0); - Tensor::MappingGuard input_guard(&input); - Tensor::MappingGuard output_guard(output); - auto *input_data = input.data(); - auto *output_data = output->mutable_data(); - float scale; - int32_t zero_point; - - AdjustRange(min_in, max_in, false, &scale, &zero_point); - - Dequantize(input_data, input.size(), scale, zero_point, output_data); -} - -inline void QuantizeMultiplier(double multiplier, - int32_t *output_multiplier, - int32_t *shift) { - const double q = std::frexp(multiplier, shift); - auto qint = static_cast(roundl(q * (1ll << 31))); - if (qint == (1ll << 31)) { - qint /= 2; - ++*shift; - } - *output_multiplier = static_cast(qint); - MACE_CHECK(*output_multiplier <= std::numeric_limits::max()); -} - -inline void GetOutputMultiplierAndShift( - const float lhs_scale, const float rhs_scale, const float output_scale, - int32_t *quantized_multiplier, int *right_shift) { - float real_multiplier = lhs_scale * rhs_scale / output_scale; - MACE_CHECK(real_multiplier > 0.f && real_multiplier < 1.f, real_multiplier); - - int exponent; - QuantizeMultiplier(real_multiplier, quantized_multiplier, &exponent); - *right_shift = -exponent; - MACE_CHECK(*right_shift >= 0); -} - -} // namespace mace - -#endif // MACE_UTILS_QUANTIZE_H_ diff --git a/mace/utils/thread_pool.cc b/mace/utils/thread_pool.cc index 92c128a79d5baa43e3e5426c984bf5b14ca0e405..8d7d98c8535065b9ac1a9a55325f441898bec4a1 100644 --- a/mace/utils/thread_pool.cc +++ b/mace/utils/thread_pool.cc @@ -163,19 +163,19 @@ void ThreadPool::Init() { count_down_latch_.Wait(); } -void ThreadPool::Run(const std::function &func, - size_t iterations) { +void ThreadPool::Run(const std::function &func, + const int64_t iterations) { const size_t thread_count = threads_.size(); - const size_t iters_per_thread = iterations / thread_count; - const size_t remainder = iterations % thread_count; - size_t iters_offset = 0; + const int64_t iters_per_thread = iterations / thread_count; + const int64_t remainder = iterations % thread_count; + int64_t iters_offset = 0; std::unique_lock run_lock(run_mutex_); for (size_t i = 0; i < thread_count; ++i) { - size_t count = iters_per_thread + (i < remainder); + int64_t count = iters_per_thread + (static_cast(i) < remainder); thread_infos_[i].range_start = iters_offset; - size_t range_end = std::min(iterations, iters_offset + count); + int64_t range_end = std::min(iterations, iters_offset + count); thread_infos_[i].range_end = range_end; thread_infos_[i].range_len = range_end - iters_offset; thread_infos_[i].func = reinterpret_cast(&func); @@ -263,10 +263,10 @@ void ThreadPool::ThreadLoop(size_t tid) { void ThreadPool::ThreadRun(size_t tid) { ThreadInfo &thread_info = thread_infos_[tid]; uintptr_t func_ptr = thread_info.func; - const std::function *func = - reinterpret_cast *>(func_ptr); + const std::function *func = + reinterpret_cast *>(func_ptr); // do own work - size_t range_len; + int64_t range_len; while ((range_len = thread_info.range_len) > 0) { if (thread_info.range_len.compare_exchange_strong(range_len, range_len - 1)) { @@ -280,33 +280,33 @@ void ThreadPool::ThreadRun(size_t tid) { t = (t + 1) % thread_count) { ThreadInfo 
diff --git a/mace/utils/thread_pool.cc b/mace/utils/thread_pool.cc
index 92c128a79d5baa43e3e5426c984bf5b14ca0e405..8d7d98c8535065b9ac1a9a55325f441898bec4a1 100644
--- a/mace/utils/thread_pool.cc
+++ b/mace/utils/thread_pool.cc
@@ -163,19 +163,19 @@ void ThreadPool::Init() {
   count_down_latch_.Wait();
 }
 
-void ThreadPool::Run(const std::function<void(size_t)> &func,
-                     size_t iterations) {
+void ThreadPool::Run(const std::function<void(int64_t)> &func,
+                     const int64_t iterations) {
   const size_t thread_count = threads_.size();
-  const size_t iters_per_thread = iterations / thread_count;
-  const size_t remainder = iterations % thread_count;
-  size_t iters_offset = 0;
+  const int64_t iters_per_thread = iterations / thread_count;
+  const int64_t remainder = iterations % thread_count;
+  int64_t iters_offset = 0;
 
   std::unique_lock<std::mutex> run_lock(run_mutex_);
 
   for (size_t i = 0; i < thread_count; ++i) {
-    size_t count = iters_per_thread + (i < remainder);
+    int64_t count = iters_per_thread + (static_cast<int64_t>(i) < remainder);
     thread_infos_[i].range_start = iters_offset;
-    size_t range_end = std::min(iterations, iters_offset + count);
+    int64_t range_end = std::min(iterations, iters_offset + count);
     thread_infos_[i].range_end = range_end;
     thread_infos_[i].range_len = range_end - iters_offset;
     thread_infos_[i].func = reinterpret_cast<uintptr_t>(&func);
@@ -263,10 +263,10 @@ void ThreadPool::ThreadLoop(size_t tid) {
 void ThreadPool::ThreadRun(size_t tid) {
   ThreadInfo &thread_info = thread_infos_[tid];
   uintptr_t func_ptr = thread_info.func;
-  const std::function<void(size_t)> *func =
-      reinterpret_cast<const std::function<void(size_t)> *>(func_ptr);
+  const std::function<void(int64_t)> *func =
+      reinterpret_cast<const std::function<void(int64_t)> *>(func_ptr);
   // do own work
-  size_t range_len;
+  int64_t range_len;
   while ((range_len = thread_info.range_len) > 0) {
     if (thread_info.range_len.compare_exchange_strong(range_len,
                                                       range_len - 1)) {
@@ -280,33 +280,33 @@ void ThreadPool::ThreadRun(size_t tid) {
        t = (t + 1) % thread_count) {
     ThreadInfo &other_thread_info = thread_infos_[t];
     uintptr_t other_func_ptr = other_thread_info.func;
-    const std::function<void(size_t)> *other_func =
-        reinterpret_cast<const std::function<void(size_t)> *>(
+    const std::function<void(int64_t)> *other_func =
+        reinterpret_cast<const std::function<void(int64_t)> *>(
             other_func_ptr);
     while ((range_len = other_thread_info.range_len) > 0) {
       if (other_thread_info.range_len.compare_exchange_strong(range_len,
                                                               range_len - 1)) {
-        size_t tail = other_thread_info.range_end--;
+        int64_t tail = other_thread_info.range_end--;
         other_func->operator()(tail - 1);
       }
     }
   }
 }
 
-void ThreadPool::Compute1D(const std::function<void(size_t,
-                                                    size_t,
-                                                    size_t)> &func,
-                           size_t start,
-                           size_t end,
-                           size_t step,
-                           size_t tile_size,
-                           int cost_per_item) {
+void ThreadPool::Compute1D(const std::function<void(int64_t,
+                                                    int64_t,
+                                                    int64_t)> &func,
+                           const int64_t start,
+                           const int64_t end,
+                           const int64_t step,
+                           int64_t tile_size,
+                           const int cost_per_item) {
   if (start >= end) {
     return;
   }
 
-  size_t items = 1 + (end - start - 1) / step;
+  int64_t items = 1 + (end - start - 1) / step;
   if (threads_.size() <= 1 || (cost_per_item >= 0 &&
       items * cost_per_item < kMaxCostUsingSingleThread)) {
     func(start, end, step);
@@ -314,39 +314,39 @@ void ThreadPool::Compute1D(const std::function<void(size_t,
   }
 
   if (tile_size == 0) {
-    tile_size = std::max(static_cast<size_t>(1), items / default_tile_count_);
+    tile_size = std::max(static_cast<int64_t>(1), items / default_tile_count_);
   }
 
-  size_t step_tile_size = step * tile_size;
-  size_t tile_count = RoundUpDiv(items, tile_size);
-  Run([&](size_t tile_idx) {
-    size_t tile_start = start + tile_idx * step_tile_size;
-    size_t tile_end = std::min(end, tile_start + step_tile_size);
+  int64_t step_tile_size = step * tile_size;
+  int64_t tile_count = RoundUpDiv(items, tile_size);
+  Run([&](int64_t tile_idx) {
+    int64_t tile_start = start + tile_idx * step_tile_size;
+    int64_t tile_end = std::min(end, tile_start + step_tile_size);
     func(tile_start, tile_end, step);
   }, tile_count);
 }
 
-void ThreadPool::Compute2D(const std::function<void(size_t,
-                                                    size_t,
-                                                    size_t,
-                                                    size_t,
-                                                    size_t,
-                                                    size_t)> &func,
-                           size_t start0,
-                           size_t end0,
-                           size_t step0,
-                           size_t start1,
-                           size_t end1,
-                           size_t step1,
-                           size_t tile_size0,
-                           size_t tile_size1,
-                           int cost_per_item) {
+void ThreadPool::Compute2D(const std::function<void(int64_t,
+                                                    int64_t,
+                                                    int64_t,
+                                                    int64_t,
+                                                    int64_t,
+                                                    int64_t)> &func,
+                           const int64_t start0,
+                           const int64_t end0,
+                           const int64_t step0,
+                           const int64_t start1,
+                           const int64_t end1,
+                           const int64_t step1,
+                           int64_t tile_size0,
+                           int64_t tile_size1,
+                           const int cost_per_item) {
   if (start0 >= end0 || start1 >= end1) {
     return;
   }
 
-  size_t items0 = 1 + (end0 - start0 - 1) / step0;
-  size_t items1 = 1 + (end1 - start1 - 1) / step1;
+  int64_t items0 = 1 + (end0 - start0 - 1) / step0;
+  int64_t items1 = 1 + (end1 - start1 - 1) / step1;
   if (threads_.size() <= 1 || (cost_per_item >= 0 &&
       items0 * items1 * cost_per_item < kMaxCostUsingSingleThread)) {
     func(start0, end0, step0, start1, end1, step1);
@@ -359,56 +359,56 @@ void ThreadPool::Compute2D(const std::function<void(size_t,
       tile_size1 = items1;
     } else {
       tile_size0 = 1;
-      tile_size1 = std::max(static_cast<size_t>(1),
+      tile_size1 = std::max(static_cast<int64_t>(1),
                             items1 * items0 / default_tile_count_);
     }
   }
 
-  size_t step_tile_size0 = step0 * tile_size0;
-  size_t step_tile_size1 = step1 * tile_size1;
-  size_t tile_count0 = RoundUpDiv(items0, tile_size0);
-  size_t tile_count1 = RoundUpDiv(items1, tile_size1);
-
-  Run([&](size_t tile_idx) {
-    size_t tile_idx0 = tile_idx / tile_count1;
-    size_t tile_idx1 = tile_idx - tile_idx0 * tile_count1;
-    size_t tile_start0 = start0 + tile_idx0 * step_tile_size0;
-    size_t tile_end0 = std::min(end0, tile_start0 + step_tile_size0);
-    size_t tile_start1 = start1 + tile_idx1 * step_tile_size1;
-    size_t tile_end1 = std::min(end1, tile_start1 + step_tile_size1);
+  int64_t step_tile_size0 = step0 * tile_size0;
+  int64_t step_tile_size1 = step1 * tile_size1;
+  int64_t tile_count0 = RoundUpDiv(items0, tile_size0);
+  int64_t tile_count1 = RoundUpDiv(items1, tile_size1);
+
+  Run([&](int64_t tile_idx) {
+    int64_t tile_idx0 = tile_idx / tile_count1;
+    int64_t tile_idx1 = tile_idx - tile_idx0 * tile_count1;
+    int64_t tile_start0 = start0 + tile_idx0 * step_tile_size0;
+    int64_t tile_end0 = std::min(end0, tile_start0 + step_tile_size0);
+    int64_t tile_start1 = start1 + tile_idx1 * step_tile_size1;
+    int64_t tile_end1 = std::min(end1, tile_start1 + step_tile_size1);
     func(tile_start0, tile_end0, step0, tile_start1, tile_end1, step1);
   }, tile_count0 * tile_count1);
 }
 
-void ThreadPool::Compute3D(const std::function<void(size_t,
-                                                    size_t,
-                                                    size_t,
-                                                    size_t,
-                                                    size_t,
-                                                    size_t,
-                                                    size_t,
-                                                    size_t,
-                                                    size_t)> &func,
-                           size_t start0,
-                           size_t end0,
-                           size_t step0,
-                           size_t start1,
-                           size_t end1,
-                           size_t step1,
-                           size_t start2,
-                           size_t end2,
-                           size_t step2,
-                           size_t tile_size0,
-                           size_t tile_size1,
-                           size_t tile_size2,
-                           int cost_per_item) {
+void ThreadPool::Compute3D(const std::function<void(int64_t,
+                                                    int64_t,
+                                                    int64_t,
+                                                    int64_t,
+                                                    int64_t,
+                                                    int64_t,
+                                                    int64_t,
+                                                    int64_t,
+                                                    int64_t)> &func,
+                           const int64_t start0,
+                           const int64_t end0,
+                           const int64_t step0,
+                           const int64_t start1,
+                           const int64_t end1,
+                           const int64_t step1,
+                           const int64_t start2,
+                           const int64_t end2,
+                           const int64_t step2,
+                           int64_t tile_size0,
+                           int64_t tile_size1,
+                           int64_t tile_size2,
+                           const int cost_per_item) {
   if (start0 >= end0 || start1 >= end1 || start2 >= end1) {
     return;
   }
 
-  size_t items0 = 1 + (end0 - start0 - 1) / step0;
-  size_t items1 = 1 + (end1 - start1 - 1) / step1;
-  size_t items2 = 1 + (end2 - start2 - 1) / step2;
+  int64_t items0 = 1 + (end0 - start0 - 1) / step0;
+  int64_t items1 = 1 + (end1 - start1 - 1) / step1;
+  int64_t items2 = 1 + (end2 - start2 - 1) / step2;
   if (threads_.size() <= 1 || (cost_per_item >= 0 &&
      items0 * items1 * items2 * cost_per_item < kMaxCostUsingSingleThread)) {
@@ -423,37 +423,37 @@ void ThreadPool::Compute3D(const std::function<void(size_t,
       tile_size2 = items2;
     } else {
       tile_size0 = 1;
-      size_t items01 = items1 * items0;
+      int64_t items01 = items1 * items0;
       if (items01 >= default_tile_count_) {
         tile_size1 = items01 / default_tile_count_;
         tile_size2 = items2;
       } else {
         tile_size1 = 1;
-        tile_size2 = std::max(static_cast<size_t>(1),
+        tile_size2 = std::max(static_cast<int64_t>(1),
                               items01 * items2 / default_tile_count_);
       }
     }
   }
 
-  size_t step_tile_size0 = step0 * tile_size0;
-  size_t step_tile_size1 = step1 * tile_size1;
-  size_t step_tile_size2 = step2 * tile_size2;
-  size_t tile_count0 = RoundUpDiv(items0, tile_size0);
-  size_t tile_count1 = RoundUpDiv(items1, tile_size1);
-  size_t tile_count2 = RoundUpDiv(items2, tile_size2);
-  size_t tile_count12 = tile_count1 * tile_count2;
-
-  Run([&](size_t tile_idx) {
-    size_t tile_idx0 = tile_idx / tile_count12;
-    size_t tile_idx12 = tile_idx - tile_idx0 * tile_count12;
-    size_t tile_idx1 = tile_idx12 / tile_count2;
-    size_t tile_idx2 = tile_idx12 - tile_idx1 * tile_count2;
-    size_t tile_start0 = start0 + tile_idx0 * step_tile_size0;
-    size_t tile_end0 = std::min(end0, tile_start0 + step_tile_size0);
-    size_t tile_start1 = start1 + tile_idx1 * step_tile_size1;
-    size_t tile_end1 = std::min(end1, tile_start1 + step_tile_size1);
-    size_t tile_start2 = start2 + tile_idx2 * step_tile_size2;
-    size_t tile_end2 = std::min(end2, tile_start2 + step_tile_size2);
+  int64_t step_tile_size0 = step0 * tile_size0;
+  int64_t step_tile_size1 = step1 * tile_size1;
+  int64_t step_tile_size2 = step2 * tile_size2;
+  int64_t tile_count0 = RoundUpDiv(items0, tile_size0);
+  int64_t tile_count1 = RoundUpDiv(items1, tile_size1);
+  int64_t tile_count2 = RoundUpDiv(items2, tile_size2);
+  int64_t tile_count12 = tile_count1 * tile_count2;
+
+  Run([&](int64_t tile_idx) {
+    int64_t tile_idx0 = tile_idx / tile_count12;
+    int64_t tile_idx12 = tile_idx - tile_idx0 * tile_count12;
+    int64_t tile_idx1 = tile_idx12 / tile_count2;
+    int64_t tile_idx2 = tile_idx12 - tile_idx1 * tile_count2;
+    int64_t tile_start0 = start0 + tile_idx0 * step_tile_size0;
+    int64_t tile_end0 = std::min(end0, tile_start0 + step_tile_size0);
+    int64_t tile_start1 = start1 + tile_idx1 * step_tile_size1;
+    int64_t tile_end1 = std::min(end1, tile_start1 + step_tile_size1);
+    int64_t tile_start2 = start2 + tile_idx2 * step_tile_size2;
+    int64_t tile_end2 = std::min(end2, tile_start2 + step_tile_size2);
     func(tile_start0, tile_end0, step0,
diff --git a/mace/utils/thread_pool.h b/mace/utils/thread_pool.h
index 67fa89cf112cc3b3fc55124a73ef8bb63633e57c..90d30257bf66da0b7d6d82776b87071779396b9f 100644
--- a/mace/utils/thread_pool.h
+++ b/mace/utils/thread_pool.h
@@ -37,54 +37,55 @@ class ThreadPool {
 
   void Init();
 
-  void Run(const std::function<void(size_t)> &func, size_t iterations);
-
-  void Compute1D(const std::function<void(size_t,
-                                          size_t,
-                                          size_t)> &func,
-                 size_t start,
-                 size_t end,
-                 size_t step,
-                 size_t tile_size = 0,
+  void Run(const std::function<void(int64_t)> &func,
+           const int64_t iterations);
+
+  void Compute1D(const std::function<void(int64_t,
+                                          int64_t,
+                                          int64_t)> &func,
+                 int64_t start,
+                 int64_t end,
+                 int64_t step,
+                 int64_t tile_size = 0,
                  int cost_per_item = -1);
 
-  void Compute2D(const std::function<void(size_t,
-                                          size_t,
-                                          size_t,
-                                          size_t,
-                                          size_t,
-                                          size_t)> &func,
-                 size_t start0,
-                 size_t end0,
-                 size_t step0,
-                 size_t start1,
-                 size_t end1,
-                 size_t step1,
-                 size_t tile_size0 = 0,
-                 size_t tile_size1 = 0,
+  void Compute2D(const std::function<void(int64_t,
+                                          int64_t,
+                                          int64_t,
+                                          int64_t,
+                                          int64_t,
+                                          int64_t)> &func,
+                 int64_t start0,
+                 int64_t end0,
+                 int64_t step0,
+                 int64_t start1,
+                 int64_t end1,
+                 int64_t step1,
+                 int64_t tile_size0 = 0,
+                 int64_t tile_size1 = 0,
                  int cost_per_item = -1);
 
-  void Compute3D(const std::function<void(size_t,
-                                          size_t,
-                                          size_t,
-                                          size_t,
-                                          size_t,
-                                          size_t,
-                                          size_t,
-                                          size_t,
-                                          size_t)> &func,
-                 size_t start0,
-                 size_t end0,
-                 size_t step0,
-                 size_t start1,
-                 size_t end1,
-                 size_t step1,
-                 size_t start2,
-                 size_t end2,
-                 size_t step2,
-                 size_t tile_size0 = 0,
-                 size_t tile_size1 = 0,
-                 size_t tile_size2 = 0,
+  void Compute3D(const std::function<void(int64_t,
+                                          int64_t,
+                                          int64_t,
+                                          int64_t,
+                                          int64_t,
+                                          int64_t,
+                                          int64_t,
+                                          int64_t,
+                                          int64_t)> &func,
+                 int64_t start0,
+                 int64_t end0,
+                 int64_t step0,
+                 int64_t start1,
+                 int64_t end1,
+                 int64_t step1,
+                 int64_t start2,
+                 int64_t end2,
+                 int64_t step2,
+                 int64_t tile_size0 = 0,
+                 int64_t tile_size1 = 0,
+                 int64_t tile_size2 = 0,
                  int cost_per_item = -1);
 
  private:
@@ -100,16 +101,16 @@ class ThreadPool {
   std::mutex run_mutex_;
 
   struct ThreadInfo {
-    size_t range_start;
-    std::atomic<size_t> range_end;
-    std::atomic<size_t> range_len;
+    std::atomic<int64_t> range_start;
+    std::atomic<int64_t> range_end;
+    std::atomic<int64_t> range_len;
    uintptr_t func;
     std::vector<size_t> cpu_cores;
   };
 
   std::vector<ThreadInfo> thread_infos_;
   std::vector<std::thread> threads_;
-  size_t default_tile_count_;
+  int64_t default_tile_count_;
 };
 
 }  // namespace utils
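These signatures are what the OpenMP pragmas removed elsewhere in this change migrate to. A minimal sketch of the pattern, relying only on the Compute1D declaration shown above (the wrapper function and buffer names are hypothetical, and constructing the pool itself is outside this hunk):

#include <cstdint>

#include "mace/utils/thread_pool.h"

// Before: "#pragma omp parallel for" over i in [0, size).
// After: the loop body becomes a lambda over a [start, end) sub-range and the
// pool decides how to tile and distribute the range across its workers.
void ScaleBuffer(mace::utils::ThreadPool *thread_pool,
                 const float *input, float *output,
                 const int64_t size, const float scale) {
  thread_pool->Compute1D(
      [=](int64_t start, int64_t end, int64_t step) {
        for (int64_t i = start; i < end; i += step) {
          output[i] = scale * input[i];
        }
      },
      0, size, 1);  // start, end, step; tile_size and cost_per_item default
}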
diff --git a/mace/utils/thread_pool_test.cc b/mace/utils/thread_pool_test.cc
index 281b335b42b24d0b86e5c6816d03a50cb8845633..a6d5cb04d4f6122b1e18774a737aa470e7261f25 100644
--- a/mace/utils/thread_pool_test.cc
+++ b/mace/utils/thread_pool_test.cc
@@ -30,27 +30,29 @@ class ThreadPoolTest : public ::testing::Test {
   ThreadPool thread_pool;
 };
 
-void Test1D(size_t start, size_t end, size_t step, std::vector<float> *res) {
-  for (size_t i = start; i < end; i += step) {
+void Test1D(int64_t start, int64_t end, int64_t step, std::vector<float> *res) {
+  for (int64_t i = start; i < end; i += step) {
     (*res)[i]++;
   }
 }
 
-void Test2D(size_t start0, size_t end0, size_t step0,
-            size_t start1, size_t end1, size_t step1, std::vector<float> *res) {
-  for (size_t i = start0; i < end0; i += step0) {
-    for (size_t j = start1; j < end1; j += step1) {
+void Test2D(int64_t start0, int64_t end0, int64_t step0,
+            int64_t start1, int64_t end1, int64_t step1,
+            std::vector<float> *res) {
+  for (int64_t i = start0; i < end0; i += step0) {
+    for (int64_t j = start1; j < end1; j += step1) {
       (*res)[i * 100 + j]++;
     }
   }
 }
 
-void Test3D(size_t start0, size_t end0, size_t step0,
-            size_t start1, size_t end1, size_t step1,
-            size_t start2, size_t end2, size_t step2, std::vector<float> *res) {
-  for (size_t i = start0; i < end0; i += step0) {
-    for (size_t j = start1; j < end1; j += step1) {
-      for (size_t k = start2; k < end2; k += step2) {
+void Test3D(int64_t start0, int64_t end0, int64_t step0,
+            int64_t start1, int64_t end1, int64_t step1,
+            int64_t start2, int64_t end2, int64_t step2,
+            std::vector<float> *res) {
+  for (int64_t i = start0; i < end0; i += step0) {
+    for (int64_t j = start1; j < end1; j += step1) {
+      for (int64_t k = start2; k < end2; k += step2) {
         (*res)[(i * 100 + j) * 100 + k]++;
       }
     }
@@ -58,47 +60,47 @@ void Test3D(size_t start0, size_t end0, size_t step0,
 }
 
 TEST_F(ThreadPoolTest, Compute1D) {
-  size_t test_size = 100;
+  int64_t test_size = 100;
   std::vector<float> actual(test_size, 0);
-  thread_pool.Compute1D([&](size_t start, size_t end, size_t step) {
+  thread_pool.Compute1D([&](int64_t start, int64_t end, int64_t step) {
     Test1D(start, end, step, &actual);
   }, 0, test_size, 2);
 
   std::vector<float> expected(test_size, 0);
   Test1D(0, test_size, 2, &expected);
-  for (size_t i = 0; i < test_size; ++i) {
+  for (int64_t i = 0; i < test_size; ++i) {
     EXPECT_EQ(expected[i], actual[i]);
   }
 }
 
 TEST_F(ThreadPoolTest, Compute2D) {
-  size_t test_size = 100;
+  int64_t test_size = 100;
   std::vector<float> actual(test_size * test_size, 0);
-  thread_pool.Compute2D([&](size_t start0, size_t end0, size_t step0,
-                            size_t start1, size_t end1, size_t step1) {
+  thread_pool.Compute2D([&](int64_t start0, int64_t end0, int64_t step0,
+                            int64_t start1, int64_t end1, int64_t step1) {
     Test2D(start0, end0, step0, start1, end1, step1, &actual);
   }, 0, test_size, 2, 0, test_size, 2);
 
   std::vector<float> expected(test_size * test_size, 0);
   Test2D(0, test_size, 2, 0, test_size, 2, &expected);
-  for (size_t i = 0; i < test_size * test_size; ++i) {
+  for (int64_t i = 0; i < test_size * test_size; ++i) {
     EXPECT_EQ(expected[i], actual[i]);
   }
 }
 
 TEST_F(ThreadPoolTest, Compute3D) {
-  size_t test_size = 100;
+  int64_t test_size = 100;
   std::vector<float> actual(test_size * test_size * test_size, 0);
-  thread_pool.Compute3D([&](size_t start0, size_t end0, size_t step0,
-                            size_t start1, size_t end1, size_t step1,
-                            size_t start2, size_t end2, size_t step2) {
+  thread_pool.Compute3D([&](int64_t start0, int64_t end0, int64_t step0,
+                            int64_t start1, int64_t end1, int64_t step1,
+                            int64_t start2, int64_t end2, int64_t step2) {
     Test3D(start0, end0, step0, start1, end1, step1,
            start2, end2, step2, &actual);
   }, 0, test_size, 2, 0, test_size, 2, 0, test_size, 2);
 
   std::vector<float> expected(test_size * test_size * test_size, 0);
   Test3D(0, test_size, 2, 0, test_size, 2, 0, test_size, 2, &expected);
-  for (size_t i = 0; i < test_size * test_size * test_size; ++i) {
+  for (int64_t i = 0; i < test_size * test_size * test_size; ++i) {
     EXPECT_EQ(expected[i], actual[i]);
   }
 }
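The tests above rely on the default tiling (tile_size = 0, cost_per_item = -1). Per the Compute1D implementation earlier in this diff, tile_size = 0 makes the pool derive a tile from default_tile_count_, while a non-negative cost_per_item lets a call short-circuit to a plain inline loop when items * cost_per_item stays under kMaxCostUsingSingleThread. A hypothetical call passing explicit hints (function name and values are illustrative only):

#include <cstdint>
#include <vector>

#include "mace/utils/thread_pool.h"

void AddBias(mace::utils::ThreadPool *thread_pool,
             std::vector<float> *data, const float bias) {
  const int64_t size = static_cast<int64_t>(data->size());
  float *ptr = data->data();
  thread_pool->Compute1D(
      [=](int64_t start, int64_t end, int64_t step) {
        for (int64_t i = start; i < end; i += step) {
          ptr[i] += bias;
        }
      },
      0, size, 1,
      /*tile_size=*/1024,    // fixed work-stealing granularity
      /*cost_per_item=*/2);  // rough per-element cost hint
}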
diff --git a/tools/bazel.rc b/tools/bazel.rc
index ef5fd59791bcb68cb0bc1ffc75ad936b7f3d58c4..0a49d2cebd18add2bab3b7eae341b730acf771dd 100644
--- a/tools/bazel.rc
+++ b/tools/bazel.rc
@@ -8,7 +8,6 @@ build --copt=-std=c++11
 build --copt=-fPIC
 build --copt=-D_GLIBCXX_USE_C99_MATH_TR1
 build --copt=-DMACE_OBFUSCATE_LITERALS
-build --copt=-DGEMMLOWP_USE_OPENMP
 
 # Usage example: bazel build --config android
 build:android --linkopt=-pie
diff --git a/tools/bazel_adb_run.py b/tools/bazel_adb_run.py
index 7083d3180b94acca93778ad33667bb7869b46f17..dec4e21f566490c4dc8f4cf218f2999a25e44d3a 100644
--- a/tools/bazel_adb_run.py
+++ b/tools/bazel_adb_run.py
@@ -107,8 +107,8 @@ def parse_args():
     parser.add_argument(
         "--enable_openmp",
         type=str2bool,
-        default=True,
-        help="Disable openmp for multiple thread.")
+        default=False,
+        help="Whether to use openmp")
     parser.add_argument(
         '--address_sanitizer',
         action="store_true",
@@ -143,9 +143,9 @@ def main(unused_args):
        abi=target_abi,
        toolchain=toolchain,
        enable_neon=FLAGS.enable_neon,
+       enable_openmp=FLAGS.enable_openmp,
        address_sanitizer=FLAGS.address_sanitizer,
-       debug_mode=FLAGS.debug_mode,
-       enable_openmp=FLAGS.enable_openmp)
+       debug_mode=FLAGS.debug_mode)
     if FLAGS.run_target:
         target_devices = DeviceManager.list_devices(FLAGS.device_yml)
         if FLAGS.target_socs != TargetSOCTag.all and\
diff --git a/tools/build-standalone-lib.sh b/tools/build-standalone-lib.sh
index 5d35e541a5be5252ff257865bbfaddac464925e7..f812450d1371b5bb7b5a713dad3a84da28d4ea9b 100755
--- a/tools/build-standalone-lib.sh
+++ b/tools/build-standalone-lib.sh
@@ -33,67 +33,67 @@ mkdir -p $LIB_DIR/aarch64_linux_gnu/cpu_gpu
 
 # build shared libraries
 echo "build shared lib for armeabi-v7a + cpu_gpu_dsp"
-bazel build --config android --config optimization mace/libmace:libmace_dynamic --define neon=true --define openmp=true --define opencl=true --define hexagon=true --define quantize=true --cpu=armeabi-v7a
+bazel build --config android --config optimization mace/libmace:libmace_dynamic --define neon=true --define openmp=false --define opencl=true --define hexagon=true --define quantize=true --cpu=armeabi-v7a
 cp bazel-bin/mace/libmace/libmace.so $LIB_DIR/armeabi-v7a/cpu_gpu_dsp/
 cp third_party/nnlib/armeabi-v7a/*so $LIB_DIR/armeabi-v7a/cpu_gpu_dsp/
 
 echo "build shared lib for arm64-v8a + cpu_gpu_dsp"
-bazel build --config android --config optimization mace/libmace:libmace_dynamic --define neon=true --define openmp=true --define opencl=true --define hexagon=true --define quantize=true --cpu=arm64-v8a
+bazel build --config android --config optimization mace/libmace:libmace_dynamic --define neon=true --define openmp=false --define opencl=true --define hexagon=true --define quantize=true --cpu=arm64-v8a
 cp bazel-bin/mace/libmace/libmace.so $LIB_DIR/arm64-v8a/cpu_gpu_dsp/
 cp third_party/nnlib/arm64-v8a/*so $LIB_DIR/arm64-v8a/cpu_gpu_dsp/
 
 echo "build shared lib for armeabi-v7a + cpu_gpu"
-bazel build --config android --config optimization mace/libmace:libmace_dynamic --define neon=true --define openmp=true --define opencl=true --define quantize=true --cpu=armeabi-v7a
+bazel build --config android --config optimization mace/libmace:libmace_dynamic --define neon=true --define openmp=false --define opencl=true --define quantize=true --cpu=armeabi-v7a
 cp bazel-bin/mace/libmace/libmace.so $LIB_DIR/armeabi-v7a/cpu_gpu/
 
 echo "build shared lib for arm64-v8a + cpu_gpu"
-bazel build --config android --config optimization mace/libmace:libmace_dynamic --define neon=true --define openmp=true --define opencl=true --define quantize=true --cpu=arm64-v8a
+bazel build --config android --config optimization mace/libmace:libmace_dynamic --define neon=true --define openmp=false --define opencl=true --define quantize=true --cpu=arm64-v8a
 cp bazel-bin/mace/libmace/libmace.so $LIB_DIR/arm64-v8a/cpu_gpu/
 
 echo "build shared lib for arm_linux_gnueabihf + cpu_gpu"
-bazel build --config arm_linux_gnueabihf --config optimization mace/libmace:libmace_dynamic --define neon=true --define openmp=true --define opencl=true --define quantize=true
+bazel build --config arm_linux_gnueabihf --config optimization mace/libmace:libmace_dynamic --define neon=true --define openmp=false --define opencl=true --define quantize=true
 cp bazel-bin/mace/libmace/libmace.so $LIB_DIR/arm_linux_gnueabihf/cpu_gpu/
 
 echo "build shared lib for aarch64_linux_gnu + cpu_gpu"
-bazel build --config aarch64_linux_gnu --config optimization mace/libmace:libmace_dynamic --define neon=true --define openmp=true --define opencl=true --define quantize=true
+bazel build --config aarch64_linux_gnu --config optimization mace/libmace:libmace_dynamic --define neon=true --define openmp=false --define opencl=true --define quantize=true
 cp bazel-bin/mace/libmace/libmace.so $LIB_DIR/aarch64_linux_gnu/cpu_gpu/
 
 if [[ "$OSTYPE" != "darwin"* ]];then
   echo "build shared lib for linux-x86-64"
-  bazel build mace/libmace:libmace_dynamic --config optimization --define quantize=true --define openmp=true
+  bazel build mace/libmace:libmace_dynamic --config optimization --define quantize=true --define openmp=false
   cp bazel-bin/mace/libmace/libmace.so $LIB_DIR/linux-x86-64/
 fi
 
 # build static libraries
 echo "build static lib for armeabi-v7a + cpu_gpu_dsp"
-bazel build --config android --config optimization mace/libmace:libmace_static --config symbol_hidden --define neon=true --define openmp=true --define opencl=true --define hexagon=true --define quantize=true --cpu=armeabi-v7a
+bazel build --config android --config optimization mace/libmace:libmace_static --config symbol_hidden --define neon=true --define openmp=false --define opencl=true --define hexagon=true --define quantize=true --cpu=armeabi-v7a
 cp bazel-genfiles/mace/libmace/libmace.a $LIB_DIR/armeabi-v7a/cpu_gpu_dsp/
 cp third_party/nnlib/armeabi-v7a/*so $LIB_DIR/armeabi-v7a/cpu_gpu_dsp/
 
 echo "build static lib for arm64-v8a + cpu_gpu_dsp"
-bazel build --config android --config optimization mace/libmace:libmace_static --config symbol_hidden --define neon=true --define openmp=true --define opencl=true --define hexagon=true --define quantize=true --cpu=arm64-v8a
+bazel build --config android --config optimization mace/libmace:libmace_static --config symbol_hidden --define neon=true --define openmp=false --define opencl=true --define hexagon=true --define quantize=true --cpu=arm64-v8a
 cp bazel-genfiles/mace/libmace/libmace.a $LIB_DIR/arm64-v8a/cpu_gpu_dsp/
 cp third_party/nnlib/arm64-v8a/*so $LIB_DIR/arm64-v8a/cpu_gpu_dsp/
 
 echo "build static lib for armeabi-v7a + cpu_gpu"
-bazel build --config android --config optimization mace/libmace:libmace_static --config symbol_hidden --define neon=true --define openmp=true --define opencl=true --define quantize=true --cpu=armeabi-v7a
+bazel build --config android --config optimization mace/libmace:libmace_static --config symbol_hidden --define neon=true --define openmp=false --define opencl=true --define quantize=true --cpu=armeabi-v7a
 cp bazel-genfiles/mace/libmace/libmace.a $LIB_DIR/armeabi-v7a/cpu_gpu/
 
 echo "build static lib for arm64-v8a + cpu_gpu"
-bazel build --config android --config optimization mace/libmace:libmace_static --config symbol_hidden --define neon=true --define openmp=true --define opencl=true --define quantize=true --cpu=arm64-v8a
+bazel build --config android --config optimization mace/libmace:libmace_static --config symbol_hidden --define neon=true --define openmp=false --define opencl=true --define quantize=true --cpu=arm64-v8a
 cp bazel-genfiles/mace/libmace/libmace.a $LIB_DIR/arm64-v8a/cpu_gpu/
 
 echo "build static lib for arm_linux_gnueabihf + cpu_gpu"
-bazel build --config arm_linux_gnueabihf --config optimization mace/libmace:libmace_static --config symbol_hidden --define neon=true --define openmp=true --define opencl=true --define quantize=true
+bazel build --config arm_linux_gnueabihf --config optimization mace/libmace:libmace_static --config symbol_hidden --define neon=true --define openmp=false --define opencl=true --define quantize=true
 cp bazel-genfiles/mace/libmace/libmace.a $LIB_DIR/arm_linux_gnueabihf/cpu_gpu/
 
 echo "build static lib for aarch64_linux_gnu + cpu_gpu"
-bazel build --config aarch64_linux_gnu --config optimization mace/libmace:libmace_static --config symbol_hidden --define neon=true --define openmp=true --define opencl=true --define quantize=true
+bazel build --config aarch64_linux_gnu --config optimization mace/libmace:libmace_static --config symbol_hidden --define neon=true --define openmp=false --define opencl=true --define quantize=true
 cp bazel-genfiles/mace/libmace/libmace.a $LIB_DIR/aarch64_linux_gnu/cpu_gpu/
 
 if [[ "$OSTYPE" != "darwin"* ]];then
   echo "build static lib for linux-x86-64"
-  bazel build mace/libmace:libmace_static --config optimization --define quantize=true --define openmp=true
+  bazel build mace/libmace:libmace_static --config optimization --define quantize=true --define openmp=false
   cp bazel-genfiles/mace/libmace/libmace.a $LIB_DIR/linux-x86-64/
 fi
diff --git a/tools/converter.py b/tools/converter.py
index 7bf387bd5835517d0b2c524d709febaf89fee175..a5df88a9cecd8493b26b6462b33a9aaff729f99b 100644
--- a/tools/converter.py
+++ b/tools/converter.py
@@ -983,7 +983,7 @@ def run_mace(flags):
                 build_example(configs,
                               target_abi,
                               toolchain,
-                              not flags.disable_openmp,
+                              flags.enable_openmp,
                               flags.mace_lib_type,
                               flags.cl_binary_to_code,
                               device,
@@ -992,7 +992,7 @@ def run_mace(flags):
                 build_mace_run(configs,
                                target_abi,
                                toolchain,
-                               not flags.disable_openmp,
+                               flags.enable_openmp,
                                flags.address_sanitizer,
                                flags.mace_lib_type,
                                flags.debug_mode)
@@ -1081,7 +1081,7 @@ def benchmark_model(flags):
             build_benchmark_model(configs,
                                   target_abi,
                                   toolchain,
-                                  not flags.disable_openmp,
+                                  flags.enable_openmp,
                                   flags.mace_lib_type,
                                   flags.debug_mode)
             device = DeviceWrapper(dev)
@@ -1171,9 +1171,9 @@ def parse_args():
         default=DefaultValues.mace_lib_type,
         help="[static | dynamic], Which type MACE library to use.")
     run_bm_parent_parser.add_argument(
-        "--disable_openmp",
+        "--enable_openmp",
         action="store_true",
-        help="Disable openmp for multiple thread.")
+        help="Enable openmp for multiple thread.")
     run_bm_parent_parser.add_argument(
         "--omp_num_threads",
         type=int,
diff --git a/tools/sh_commands.py b/tools/sh_commands.py
index 44047c26518ea0ba00a3ea58a384b132992cc284..3b98c7a691bf6a047bdc91bfd4c90cc36d336d4e 100644
--- a/tools/sh_commands.py
+++ b/tools/sh_commands.py
@@ -265,7 +265,7 @@ def bazel_build(target,
                 toolchain='android',
                 enable_hexagon=False,
                 enable_hta=False,
-                enable_openmp=True,
+                enable_openmp=False,
                 enable_neon=True,
                 enable_opencl=True,
                 enable_quantize=True,