diff --git a/mace/core/BUILD b/mace/core/BUILD
index 5adf9010ec01e7d69292a92de492eaaf64f4654c..77a3e0dd8ae3c9a21aa0ebc973867cba07910cfd 100644
--- a/mace/core/BUILD
+++ b/mace/core/BUILD
@@ -47,24 +47,33 @@ cc_library(
     srcs = glob(
         ["*.cc",],
         exclude=[
-            "logging.cc"
+            "logging.cc",
+            "opencl_allocator.cc",
+        ]) + if_android([
+            "opencl_allocator.cc",
         ]),
     hdrs = glob(
         ["*.h"],
         exclude=[
-            "logging.h"
+            "logging.h",
+            "opencl_allocator.h",
+        ]) + if_android([
+            "opencl_allocator.h",
         ]),
-    copts = ["-std=c++11"],
+    copts = ["-std=c++11"] + if_android([
+		"-D__USE_OPENCL",
+    ]),
     linkopts = if_android([
         "-pie",
     ]),
     deps = [
         ":logging",
-        ":opencl_runtime",
         "//mace/proto:cc_proto",
         "//mace/proto:stats_proto",
         "//mace/utils",
-    ],
+    ] + if_android([
+        ":opencl_runtime",
+    ]),
 )
 
 # Main program for tests
diff --git a/mace/core/allocator.cc b/mace/core/allocator.cc
index 707ea4cb0e0a3dd267e229b6a7f52e39d42e9773..84bdeb86fd87f66ef5caee92cc959f84bd19a197 100644
--- a/mace/core/allocator.cc
+++ b/mace/core/allocator.cc
@@ -3,7 +3,9 @@
 //
 
 #include "mace/core/allocator.h"
+#ifdef __USE_OPENCL
 #include "mace/core/opencl_allocator.h"
+#endif
 
 namespace mace {
 
@@ -23,6 +25,8 @@ Allocator *GetDeviceAllocator(DeviceType type) {
 
 MACE_REGISTER_ALLOCATOR(DeviceType::CPU, new CPUAllocator());
 MACE_REGISTER_ALLOCATOR(DeviceType::NEON, new CPUAllocator());
+#ifdef __USE_OPENCL
 MACE_REGISTER_ALLOCATOR(DeviceType::OPENCL, new OpenCLAllocator());
+#endif
 
 }  // namespace mace
diff --git a/mace/core/net.cc b/mace/core/net.cc
index f93089a18a2e18cc0c147b8df1e94fe79538d17c..bd3d45c20a631571354122ee64d84cc73ca686f9 100644
--- a/mace/core/net.cc
+++ b/mace/core/net.cc
@@ -4,7 +4,9 @@
 
 #include "mace/core/net.h"
 #include "mace/utils/utils.h"
+#ifdef __USE_OPENCL
 #include "mace/core/runtime/opencl/opencl_runtime.h"
+#endif
 
 namespace mace {
 
diff --git a/mace/dsp/BUILD b/mace/dsp/BUILD
index 40e81b05e5d6ad15837c36690e4bb69f5fe91c44..814d18744d47da77e28a51c9676acb260d47c23c 100644
--- a/mace/dsp/BUILD
+++ b/mace/dsp/BUILD
@@ -60,5 +60,6 @@ cc_test(
     deps = [
         "@gtest//:gtest_main",
         ":dsp",
+        "//mace/kernels:kernels",
     ],
 )
diff --git a/mace/dsp/test/quantized_add_test.cc b/mace/dsp/test/quantized_add_test.cc
index 3d89f45bfa3f8facf9979ac0f61a11c27c44131a..f30d8424f68c1613064f3c7531b9685a41a0f215 100644
--- a/mace/dsp/test/quantized_add_test.cc
+++ b/mace/dsp/test/quantized_add_test.cc
@@ -175,11 +175,11 @@ TEST(QuantizedAddTest, QuantizedAdd) {
   VLOG(0) << wrapper.ExecuteGraph(input_tensor, &output_tensor);
   wrapper.PrintLog();
 
-  // -120.0~176.47, [17, 146,232]
+  // -120.0~176.47, [17, 146, 229]
+  vector<uint8_t> expected {17, 146, 229};
   for (int i = 0; i < output_tensor.size(); ++i) {
-    std::cout << (int32_t) output_data[i] << " ";
+    EXPECT_EQ(expected[i], output_data[i]);
   }
-  std::cout << std::endl;
 
   VLOG(0) << wrapper.TeardownGraph();
   wrapper.Finalize();
diff --git a/mace/dsp/test/quantized_relu_test.cc b/mace/dsp/test/quantized_relu_test.cc
index c3883d8d3d7fa21e697d571fff584b932f59ae20..685be71d9b51ab3a833579654da568552d310a0f 100644
--- a/mace/dsp/test/quantized_relu_test.cc
+++ b/mace/dsp/test/quantized_relu_test.cc
@@ -121,10 +121,10 @@ TEST(QuantizedReluTest, QuantizedRelu) {
   VLOG(0) << wrapper.ExecuteGraph(input_tensor, &output_tensor);
   wrapper.PrintLog();
 
+  vector<uint8_t> expected {128, 128, 128, 192, 255};
   for (int i = 0; i < output_tensor.size(); ++i) {
-    std::cout << (int32_t) output_data[i] << " ";
+    EXPECT_EQ(expected[i], output_data[i]);
   }
-  std::cout << std::endl;
 
   VLOG(0) << wrapper.TeardownGraph();
   wrapper.Finalize();
diff --git a/mace/dsp/util/BUILD b/mace/dsp/util/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..4a75e104fccca2214cd0ffbf014a8c224614d9f4
--- /dev/null
+++ b/mace/dsp/util/BUILD
@@ -0,0 +1,43 @@
+# Description:
+# Mace dsp util.
+#
+
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load("//mace:mace.bzl", "if_android")
+
+# Mace DSP utility library: every C++ source in this package except the
+# *_test.cc files, plus every header.
+cc_library(
+    name = "util",
+    srcs = glob(
+        ["*.cc"],
+        exclude = ["*_test.cc"],
+    ),
+    hdrs = glob(["*.h"]),
+    copts = ["-std=c++11"],
+    # quantize.h includes headers from mace/core (common.h, tensor.h).
+    deps = [
+        "//mace/core:core",
+    ],
+)
+
+# gtest-based tests for :util, built from every *_test.cc file in this
+# package and linked statically.
+cc_test(
+    name = "util_test",
+    testonly = 1,
+    srcs = glob(["*_test.cc"]),
+    copts = ["-std=c++11"],
+    # NOTE(review): if_android comes from //mace:mace.bzl; presumably it
+    # yields these flags only for Android builds -- confirm there.
+    linkopts = if_android(["-ldl", "-lm"]),
+    linkstatic = 1,
+    deps = [
+        "@gtest//:gtest_main",
+        ":util",
+    ],
+)
diff --git a/mace/dsp/util/quantize.cc b/mace/dsp/util/quantize.cc
new file mode 100644
index 0000000000000000000000000000000000000000..42063b4f05f8bfac40d000cbd399c7131baa3d60
--- /dev/null
+++ b/mace/dsp/util/quantize.cc
@@ -0,0 +1,69 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#include "mace/dsp/util/quantize.h"
+
+namespace mace {
+
+// Quantizes the float values of in_tensor into uint8 values in out_tensor.
+// The requested range [min_in, max_in] is first adjusted by
+// QuantizeAdjustRange; the range actually used is written to min_out/max_out.
+void Quantizer::Quantize(const Tensor &in_tensor,
+                         const float min_in, const float max_in,
+                         Tensor *out_tensor,
+                         float *min_out, float *max_out) {
+  float stepsize;
+  float recip_stepsize;
+  QuantizeAdjustRange(min_in, max_in, min_out, max_out,
+                      &stepsize, &recip_stepsize);
+
+  const float *in = in_tensor.data<float>();
+  uint8_t *out = out_tensor->mutable_data<uint8_t>();
+
+  for (int i = 0; i < in_tensor.size(); i++) {
+    // Clamp while still in float: converting an out-of-range float to
+    // uint8_t is undefined behavior, so narrowing must follow the clamp.
+    float ival = (in[i] - *min_out) * recip_stepsize + 0.5f;
+    ival = std::min(255.0f, std::max(0.0f, ival));
+    out[i] = static_cast<uint8_t>(ival);
+  }
+}
+
+// Widens [min_in, max_in] to contain 0 and derives a 254-step grid whose
+// origin is shifted so that 0.0f lands on an integer quantized value.
+void Quantizer::QuantizeAdjustRange(float min_in, float max_in,
+                                    float *min_out, float *max_out,
+                                    float *stepsize_out,
+                                    float *recip_stepsize_out) {
+  const float lo = std::min(0.0f, min_in);
+  const float hi = std::max(0.0f, max_in);
+  // Keep the width strictly positive before dividing by it.
+  const float span = fmaxf(0.0001f, hi - lo);
+  const float step = span / 254.0f;
+  const float inv_step = 254.0f / span;
+  // Bias by 0.999 before truncating: zero point rounds up so min_out <= lo.
+  const int zero_point = ((0.0f - lo) * inv_step) + 0.999;
+  const float adjusted_min = -zero_point * step;
+  *min_out = adjusted_min;
+  *max_out = 255.0f * step + adjusted_min;
+  *stepsize_out = step;
+  *recip_stepsize_out = inv_step;
+}
+
+// Maps each uint8 value q of in_tensor back to q * step + min_in, where
+// step divides max_in - min_in (at least 0.0001) into 255 intervals.
+void Quantizer::DeQuantize(const Tensor &in_tensor, const float min_in,
+                           const float max_in, Tensor *out_tensor) {
+  const float span = std::max(0.0001f, max_in - min_in);
+  const float step = span / 255.0f;
+
+  const uint8_t *quantized = in_tensor.data<uint8_t>();
+  float *restored = out_tensor->mutable_data<float>();
+
+  for (int idx = 0; idx < in_tensor.size(); ++idx) {
+    restored[idx] = (quantized[idx] * step) + min_in;
+  }
+}
+
+} // namespace mace
\ No newline at end of file
diff --git a/mace/dsp/util/quantize.h b/mace/dsp/util/quantize.h
new file mode 100644
index 0000000000000000000000000000000000000000..316fdaed72b216a0cd009fe19ea84facc508fef4
--- /dev/null
+++ b/mace/dsp/util/quantize.h
@@ -0,0 +1,37 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#ifndef MACE_DSP_UTIL_QUANTIZE_H_
+#define MACE_DSP_UTIL_QUANTIZE_H_
+
+#include "mace/core/common.h"
+#include "mace/core/tensor.h"
+
+namespace mace {
+
+// Converts tensors between float and uint8 quantized representations.
+class Quantizer {
+ public:
+  Quantizer() {}
+  ~Quantizer() {}
+
+  // float -> uint8; the range actually used is written to min_out/max_out.
+  void Quantize(const Tensor &in_tensor, const float min_in,
+                const float max_in, Tensor *out_tensor,
+                float *min_out, float *max_out);
+  // uint8 -> float, interpreting the data over [min_in, max_in].
+  void DeQuantize(const Tensor &in_tensor, const float min_in,
+                  const float max_in, Tensor *out_tensor);
+
+ private:
+  // Derives the output range and step sizes from the requested range.
+  void QuantizeAdjustRange(float min_in, float max_in,
+                           float *min_out, float *max_out,
+                           float *stepsize_out,
+                           float *recip_stepsize_out);
+};
+
+} // namespace mace
+
+#endif // MACE_DSP_UTIL_QUANTIZE_H_
diff --git a/mace/dsp/util/quantize_test.cc b/mace/dsp/util/quantize_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..aca1eadb98bd675b123967ca88638299a487ef74
--- /dev/null
+++ b/mace/dsp/util/quantize_test.cc
@@ -0,0 +1,45 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#include "mace/dsp/util/quantize.h"
+#include "gtest/gtest.h"
+
+using namespace mace;
+
+// Round-trips five floats in [-50, 100] through Quantize and DeQuantize.
+TEST(QuantizeTest, QuantizeAndDequantize) {
+  testing::internal::LogToStderr();
+
+  Quantizer quantizer;
+  Allocator *cpu_allocator = GetDeviceAllocator(DeviceType::CPU);
+
+  Tensor in_tensor(cpu_allocator, DataType::DT_FLOAT);
+  vector<index_t> shape {5};
+  in_tensor.Resize(shape);
+  float *in_data = in_tensor.mutable_data<float>();
+  const float samples[] = {-50.0f, -10.0f, 20.0f, 80.0f, 100.0f};
+  for (int k = 0; k < 5; ++k) in_data[k] = samples[k];
+
+  // Quantize over the requested range [-50, 100] and compare the bytes.
+  Tensor quantized_tensor(cpu_allocator, DataType::DT_UINT8);
+  quantized_tensor.Resize(shape);
+  uint8_t *quantized_data = quantized_tensor.mutable_data<uint8_t>();
+  float min_out, max_out;
+  quantizer.Quantize(in_tensor, -50.0, 100.0,
+                     &quantized_tensor, &min_out, &max_out);
+  vector<uint8_t> expected_quantize_data {0, 68, 119, 220, 254};
+  for (int i = 0; i < quantized_tensor.size(); ++i) {
+    EXPECT_EQ(expected_quantize_data[i], quantized_data[i]);
+  }
+
+  // Dequantize with the adjusted range; each value must be within 1.
+  Tensor dequantized_tensor(cpu_allocator, DataType::DT_FLOAT);
+  dequantized_tensor.Resize(shape);
+  float *dequantized_data = dequantized_tensor.mutable_data<float>();
+  quantizer.DeQuantize(quantized_tensor, min_out, max_out, &dequantized_tensor);
+  for (int i = 0; i < dequantized_tensor.size(); ++i) {
+    EXPECT_NEAR(in_data[i], dequantized_data[i], 1);
+  }
+}
+
diff --git a/mace/kernels/BUILD b/mace/kernels/BUILD
index 7a3f3007e31915e1652b7e8e72b0e16d58e99bcd..bd79ac2bbadc33beb45849300bcc82cda89a8723 100644
--- a/mace/kernels/BUILD
+++ b/mace/kernels/BUILD
@@ -8,11 +8,12 @@ package(
 licenses(["notice"])  # Apache 2.0
 
 load("//mace:mace.bzl", "if_android")
+load("//mace:mace.bzl", "if_android_arm64")
 
 cc_library(
     name = "kernels",
-    srcs = glob(["*.cc"]) + if_android(glob(["neon/*.cc", "opencl/*.cc"])),
-    hdrs = glob(["*.h"]) + if_android(glob(["neon/*.h", "opencl/*.h"])),
+    srcs = glob(["*.cc"]) + if_android(glob(["opencl/*.cc"])) + if_android_arm64(glob(["neon/*.cc"])),
+    hdrs = glob(["*.h"]) + if_android(glob(["opencl/*.cc"])) + if_android_arm64(glob(["neon/*.cc"])),
     copts = [
         "-std=c++11",
         "-fopenmp",