diff --git a/mace/core/BUILD.bazel b/mace/core/BUILD.bazel
index 14ddc6b6cfe215497d32b9a5509a3cc1835ea7a6..2c905389aec20c51a2e54672411c1a696ff1b3b9 100644
--- a/mace/core/BUILD.bazel
+++ b/mace/core/BUILD.bazel
@@ -10,11 +10,12 @@ licenses(["notice"])  # Apache 2.0
 load(
     "//mace:mace.bzl",
     "if_android",
+    "if_android_armv7",
     "if_hexagon_enabled",
-    "if_not_hexagon_enabled",
-    "if_openmp_enabled",
     "if_neon_enabled",
+    "if_not_hexagon_enabled",
     "if_opencl_enabled",
+    "if_openmp_enabled",
     "if_quantize_enabled",
 )
 
@@ -58,6 +59,9 @@ cc_library(
         "-DMACE_ENABLE_HEXAGON",
     ]) + if_neon_enabled([
         "-DMACE_ENABLE_NEON",
+    ]) + if_android_armv7([
+        "-mfpu=neon",
+        "-mfloat-abi=softfp",
     ]),
     linkopts = ["-ldl"],
     deps = [
diff --git a/mace/core/runtime/cpu/cpu_runtime.cc b/mace/core/runtime/cpu/cpu_runtime.cc
index 76cd33e231b97fd2becd6453f01c0f48c085b9b8..5db5b36b1bb8bd2d2399f1cfa4ba406e78654a40 100644
--- a/mace/core/runtime/cpu/cpu_runtime.cc
+++ b/mace/core/runtime/cpu/cpu_runtime.cc
@@ -40,19 +40,33 @@ struct CPUFreq {
   float freq;
 };
 
+enum SchedulePolicy {
+  SCHED_STATIC,
+  SCHED_GUIDED,
+};
+
 namespace {
 
 MaceStatus SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads,
-                                           const std::vector<size_t> &cpu_ids) {
+                                           const std::vector<size_t> &cpu_ids,
+                                           SchedulePolicy schedule_policy) {
   MaceOpenMPThreadCount = omp_num_threads;
 
 #ifdef MACE_ENABLE_OPENMP
   VLOG(1) << "Set OpenMP threads number: " << omp_num_threads
           << ", CPU core IDs: " << MakeString(cpu_ids);
-  omp_set_schedule(omp_sched_guided, 1);
+  if (schedule_policy == SCHED_GUIDED) {
+    omp_set_schedule(omp_sched_guided, 1);
+  } else if (schedule_policy == SCHED_STATIC) {
+    omp_set_schedule(omp_sched_static, 0);
+  } else {
+    LOG(WARNING) << "Unknown schedule policy: " << schedule_policy;
+  }
+
   omp_set_num_threads(omp_num_threads);
 #else
   MACE_UNUSED(omp_num_threads);
+  MACE_UNUSED(schedule_policy);
   LOG(WARNING) << "Set OpenMP threads number failed: OpenMP not enabled.";
 #endif
 
@@ -148,6 +162,7 @@ MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy(
   } else {
     cores_to_use = num_threads_hint;
   }
+  MACE_CHECK(cores_to_use > 0, "number of cores to use should > 0");
 
   VLOG(2) << "Use " << num_threads_hint << " threads";
   std::vector<size_t> cpu_ids(cores_to_use);
@@ -156,6 +171,10 @@ MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy(
             << cpu_freq[i].freq;
     cpu_ids[i] = cpu_freq[i].core_id;
   }
+  SchedulePolicy sched_policy = SCHED_GUIDED;
+  if (std::abs(cpu_freq[0].freq - cpu_freq[cores_to_use - 1].freq) < 1e-6) {
+    sched_policy = SCHED_STATIC;
+  }
 
 #ifdef MACE_ENABLE_QUANTIZE
   if (gemm_context) {
@@ -164,7 +183,9 @@ MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy(
   }
 #endif  // MACE_ENABLE_QUANTIZE
 
-  return SetOpenMPThreadsAndAffinityCPUs(num_threads_hint, cpu_ids);
+  return SetOpenMPThreadsAndAffinityCPUs(num_threads_hint,
+                                         cpu_ids,
+                                         sched_policy);
 }
 
 }  // namespace mace
diff --git a/mace/python/tools/converter_tool/transformer.py b/mace/python/tools/converter_tool/transformer.py
index 256a397c7e76b794d2ae33b28984660ecf20e5d0..02fc3a12ace0808e083482ea93ae9fe2d6a7c65a 100644
--- a/mace/python/tools/converter_tool/transformer.py
+++ b/mace/python/tools/converter_tool/transformer.py
@@ -1942,13 +1942,21 @@ class Transformer(base_converter.ConverterInterface):
                 continue
 
             quantized_inputs_names = []
+
             should_quantize = False
+            has_const = False
+            for idx, input_tensor in enumerate(op.input):
+                if input_tensor in self._consts:
+                    has_const = True
+                    break
+            if not has_const:
+                continue
+
             for idx, input_tensor in enumerate(op.input):
                 if self.get_tensor_data_type(input_tensor) \
                         == mace_pb2.DT_FLOAT:
                     should_quantize = True
                     break
-
             if not should_quantize:
                 continue
             else:
diff --git a/mace/utils/BUILD.bazel b/mace/utils/BUILD.bazel
index 3207441f099ab3f4bccd080ccfe043c88fa37f57..378210a3905e68188a8de35d2b1a8b1dacdefd39 100644
--- a/mace/utils/BUILD.bazel
+++ b/mace/utils/BUILD.bazel
@@ -7,6 +7,14 @@ package(
 
 licenses(["notice"])  # Apache 2.0
 
+load(
+    "//mace:mace.bzl",
+    "if_android",
+    "if_android_armv7",
+    "if_neon_enabled",
+    "if_openmp_enabled",
+)
+
 cc_library(
     name = "utils_hdrs",
     hdrs = glob([
@@ -37,7 +45,17 @@ cc_library(
         "-Werror",
         "-Wextra",
         "-Wno-missing-field-initializers",
-    ],
+    ] + if_openmp_enabled([
+        "-fopenmp",
+    ]) + if_neon_enabled([
+        "-DMACE_ENABLE_NEON",
+    ]) + if_android_armv7([
+        "-mfpu=neon",
+        "-mfloat-abi=softfp",
+    ]),
+    linkopts = if_android([
+        "-llog",
+    ]),
     deps = [
         ":utils_hdrs",
     ],
diff --git a/mace/utils/quantize.h b/mace/utils/quantize.h
index afd5a9af5bc7d57cfa1da8cca38e80b43d88f625..7634833cc1e75763d79901f68b47f46705fa97db 100644
--- a/mace/utils/quantize.h
+++ b/mace/utils/quantize.h
@@ -19,6 +19,10 @@
 #include <cmath>
 #include <limits>
 
+#if defined(MACE_ENABLE_NEON)
+#include <arm_neon.h>
+#endif  // MACE_ENABLE_NEON
+
 #include "mace/utils/logging.h"
 
 namespace mace {
@@ -156,6 +160,106 @@ inline void Dequantize(const T *input,
   }
 }
 
+#if defined(MACE_ENABLE_NEON)
+template<>
+inline void QuantizeWithScaleAndZeropoint(const float *input,
+                                          const index_t size,
+                                          float scale,
+                                          int32_t zero_point,
+                                          uint8_t *output) {
+  const float32x4_t vround = vdupq_n_f32(0.5);
+  const float32x4_t
+      vzero = vaddq_f32(vround, vcvtq_f32_s32(vdupq_n_s32(zero_point)));
+  const float recip_scale = 1.f / scale;
+  const float32x4_t vrecip_scale = vdupq_n_f32(recip_scale);
+  const index_t block_count = size / 16;
+
+#pragma omp parallel for schedule(runtime)
+  for (index_t i = 0; i < block_count; ++i) {
+    float32x4_t vi0 = vld1q_f32(input + i * 16);
+    float32x4_t vi1 = vld1q_f32(input + i * 16 + 4);
+    float32x4_t vi2 = vld1q_f32(input + i * 16 + 8);
+    float32x4_t vi3 = vld1q_f32(input + i * 16 + 12);
+
+    int32x4_t vo0_s32 = vcvtq_s32_f32(vmlaq_f32(vzero, vi0, vrecip_scale));
+    int32x4_t vo1_s32 = vcvtq_s32_f32(vmlaq_f32(vzero, vi1, vrecip_scale));
+    int32x4_t vo2_s32 = vcvtq_s32_f32(vmlaq_f32(vzero, vi2, vrecip_scale));
+    int32x4_t vo3_s32 = vcvtq_s32_f32(vmlaq_f32(vzero, vi3, vrecip_scale));
+
+    uint8x8_t vo0_u8 =
+        vqmovun_s16(vcombine_s16(vqmovn_s32(vo0_s32), vqmovn_s32(vo1_s32)));
+    uint8x8_t vo1_u8 =
+        vqmovun_s16(vcombine_s16(vqmovn_s32(vo2_s32), vqmovn_s32(vo3_s32)));
+    uint8x16_t vo = vcombine_u8(vo0_u8, vo1_u8);
+
+    vst1q_u8(output + i * 16, vo);
+  }
+
+#pragma omp parallel for schedule(runtime)
+  for (index_t i = block_count * 16; i < size; ++i) {
+    output[i] = Saturate<uint8_t>(roundf(zero_point + recip_scale * input[i]));
+  }
+}
+
+template<>
+inline void Dequantize(const int32_t *input,
+                       const index_t size,
+                       const float scale,
+                       const int32_t zero_point,
+                       float *output) {
+  const index_t block_count = size / 4;
+  const int32x4_t vzero = vdupq_n_s32(zero_point);
+  const float32x4_t vscale = vdupq_n_f32(scale);
+
+#pragma omp parallel for schedule(runtime)
+  for (index_t i = 0; i < block_count; ++i) {
+    int32x4_t vi = vld1q_s32(input + i * 4);
+    float32x4_t vo = vmulq_f32(vscale, vcvtq_f32_s32(vsubq_s32(vi, vzero)));
+    vst1q_f32(output + i * 4, vo);
+  }
+  for (index_t i = block_count * 4; i < size; ++i) {
+    output[i] = scale * (input[i] - zero_point);
+  }
+}
+
+template<>
+inline void Dequantize(const uint8_t *input,
+                       const index_t size,
+                       const float scale,
+                       const int32_t zero_point,
+                       float *output) {
+  const index_t block_count = size / 16;
+  const int32x4_t vzero = vdupq_n_s32(zero_point);
+  const float32x4_t vscale = vdupq_n_f32(scale);
+
+#pragma omp parallel for schedule(runtime)
+  for (index_t i = 0; i < block_count; ++i) {
+    uint8x16_t vi = vld1q_u8(input + i * 16);
+    float32x4x4_t vo = {
+        vmulq_f32(vscale,
+                  vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(
+                      vget_low_u16(vmovl_u8(vget_low_u8(vi))))), vzero))),
+        vmulq_f32(vscale,
+                  vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(
+                      vget_high_u16(vmovl_u8(vget_low_u8(vi))))), vzero))),
+        vmulq_f32(vscale,
+                  vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(
+                      vget_low_u16(vmovl_u8(vget_high_u8(vi))))), vzero))),
+        vmulq_f32(vscale,
+                  vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(
+                      vget_high_u16(vmovl_u8(vget_high_u8(vi))))), vzero))),
+    };
+    vst1q_f32(output + i * 16, vo.val[0]);
+    vst1q_f32(output + i * 16 + 4, vo.val[1]);
+    vst1q_f32(output + i * 16 + 8, vo.val[2]);
+    vst1q_f32(output + i * 16 + 12, vo.val[3]);
+  }
+  for (index_t i = block_count * 16; i < size; ++i) {
+    output[i] = scale * (input[i] - zero_point);
+  }
+}
+#endif  // MACE_ENABLE_NEON
+
 template <typename T>
 inline void DeQuantize(const Tensor &input,
                        const float min_in,
@@ -175,8 +279,8 @@ inline void DeQuantize(const Tensor &input,
 }
 
 inline void QuantizeMultiplier(double multiplier,
-                               int32_t* output_multiplier,
-                               int32_t* shift) {
+                               int32_t *output_multiplier,
+                               int32_t *shift) {
   const double q = std::frexp(multiplier, shift);
   auto qint = static_cast<int64_t>(roundl(q * (1ll << 31)));
   if (qint == (1ll << 31)) {