diff --git a/mace/core/tensor.h b/mace/core/tensor.h
index a497309f0ff18d6eed046c0a031caa2808671279..713a6d1e8d1755c485264c553cd795f304ddd3aa 100644
--- a/mace/core/tensor.h
+++ b/mace/core/tensor.h
@@ -142,7 +142,8 @@ class Tensor {
     buffer_ = &buffer_slice_;
   }
 
-  Tensor() : Tensor(GetCPUAllocator(), DT_FLOAT) {}
+  explicit Tensor(bool is_weight = false)
+      : Tensor(GetCPUAllocator(), DT_FLOAT, is_weight) {}
 
   ~Tensor() {
     if (is_buffer_owner_ && buffer_ != nullptr) {
diff --git a/mace/core/workspace.cc b/mace/core/workspace.cc
index 07480ad4f3fc73f85492b53bc9e2270e766f0a24..4a34cd44fd4d53e64d618ce33988e69160497bbd 100644
--- a/mace/core/workspace.cc
+++ b/mace/core/workspace.cc
@@ -14,13 +14,12 @@
 
 #include "mace/core/workspace.h"
 
-#include <memory>
-#include <string>
-#include <vector>
 #include <unordered_set>
 #include <utility>
 
 #include "mace/core/arg_helper.h"
+#include "mace/utils/quantize.h"
+
 #ifdef MACE_ENABLE_OPENCL
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #endif
@@ -34,6 +33,15 @@ bool ShouldPreallocateMemoryForOp(const OperatorDef &op) {
   };
   return reuse_buffer_ops.find(op.type()) == reuse_buffer_ops.end();
 }
+
+bool HasQuantizeOp(const NetDef &net_def) {
+  for (auto &op : net_def.op()) {
+    if (op.type() == "Quantize") {
+      return true;
+    }
+  }
+  return false;
+}
 }  // namespace
 
 Workspace::Workspace() :
@@ -146,6 +154,7 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
                              0, model_data_size);
         tensor_buffer_->UnMap();
       }
+      bool has_quantize_op = HasQuantizeOp(net_def);
       for (auto &const_tensor : net_def.tensors()) {
         MACE_LATENCY_LOGGER(2, "Load tensor ", const_tensor.name());
         VLOG(3) << "Tensor name: " << const_tensor.name()
@@ -163,11 +172,27 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
                 const_tensor.data_size() *
                     GetEnumTypeSize(const_tensor.data_type())),
                        const_tensor.data_type(), true));
-
         tensor->Reshape(dims);
         tensor->SetScale(const_tensor.scale());
         tensor->SetZeroPoint(const_tensor.zero_point());
-        tensor_map_[const_tensor.name()] = std::move(tensor);
+
+        // Only weights are quantized
+        if (const_tensor.quantized() && !has_quantize_op) {
+          std::unique_ptr<Tensor> dequantized_tensor(new Tensor(true));
+          dequantized_tensor->Resize(dims);
+          Tensor::MappingGuard quantize_guard(tensor.get());
+          Tensor::MappingGuard dequantize_guard(dequantized_tensor.get());
+          auto quantized_data = tensor->data<uint8_t>();
+          auto dequantized_data = dequantized_tensor->mutable_data<float>();
+          Dequantize(quantized_data,
+                     tensor->size(),
+                     tensor->scale(),
+                     tensor->zero_point(),
+                     dequantized_data);
+          tensor_map_[const_tensor.name()] = std::move(dequantized_tensor);
+        } else {
+          tensor_map_[const_tensor.name()] = std::move(tensor);
+        }
       }
       fused_buffer_ = true;
     }
diff --git a/mace/kernels/gemmlowp_util.h b/mace/kernels/gemmlowp_util.h
index 28d45d3a6a80fc2e3af3b9b12cd71e38b23d474d..55b5e4b5517f773948786d406ff7c4d63f6a30f6 100644
--- a/mace/kernels/gemmlowp_util.h
+++ b/mace/kernels/gemmlowp_util.h
@@ -44,8 +44,8 @@ struct GemmlowpOutputPipeline {
     bias_addition_stage.bias_vector = bias_vector;
     int32_t quantized_multiplier;
     int32_t right_shift;
-    kernels::GetOutputMultiplierAndShift(lhs_scale, rhs_scale, output_scale,
-                                         &quantized_multiplier, &right_shift);
+    GetOutputMultiplierAndShift(lhs_scale, rhs_scale, output_scale,
+                                &quantized_multiplier, &right_shift);
     gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint
         quantize_down_stage;
     quantize_down_stage.result_offset_after_shift = output_zero_point;
@@ -62,8 +62,8 @@ struct GemmlowpOutputPipeline {
       const int32_t output_zero_point) {
     int32_t quantized_multiplier;
     int32_t right_shift;
-    kernels::GetOutputMultiplierAndShift(lhs_scale, rhs_scale, output_scale,
-                                         &quantized_multiplier, &right_shift);
+    GetOutputMultiplierAndShift(lhs_scale, rhs_scale, output_scale,
+                                &quantized_multiplier, &right_shift);
     gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint
         quantize_down_stage;
     quantize_down_stage.result_offset_after_shift = output_zero_point;
diff --git a/mace/kernels/quantize.h b/mace/kernels/quantize.h
index c0280931b0643d1b44a872f96171f1e67d1f2799..337a831618a407de3fb443e67159347196b18e33 100644
--- a/mace/kernels/quantize.h
+++ b/mace/kernels/quantize.h
@@ -23,154 +23,11 @@
 #include "mace/core/future.h"
 #include "mace/core/tensor.h"
 #include "mace/kernels/kernel.h"
+#include "mace/utils/quantize.h"
 
 namespace mace {
 namespace kernels {
 
-template<typename T>
-inline void AdjustRange(const float in_min_data,
-                        const float in_max_data,
-                        const bool non_zero,
-                        float *scale,
-                        int32_t *zero_point) {
-  // re-range to make range include zero float and
-  // make zero float as integer u8
-  const T quantized_min = std::numeric_limits<T>::lowest();
-  const T quantized_max = std::numeric_limits<T>::max();
-  if (quantized_min < 0) {
-    MACE_ASSERT(!non_zero, "Cannot nudge to non_zero quantize value.");
-  }
-
-  float out_max = std::max(0.f, in_max_data);
-  float out_min = std::min(0.f, in_min_data);
-  // make in_min_data quantize as greater than 1
-  if (non_zero) {
-    out_min = std::min(out_min,
-                       in_min_data - (out_max - in_min_data)
-                           / (quantized_max - quantized_min - 1));
-  }
-
-  *scale = (out_max - out_min) / (quantized_max - quantized_min);
-  const float kEps = 1e-6;
-  if (out_min < -kEps && out_max > kEps) {
-    float quantized_zero = -out_min / *scale;
-    int32_t
-        quantized_zero_near_int = static_cast<int32_t>(roundf(quantized_zero));
-    *zero_point = quantized_zero_near_int;
-    if (fabs(quantized_zero - quantized_zero_near_int) > kEps) {
-      if (quantized_zero < quantized_zero_near_int || non_zero) {
-        // keep out_max fixed, and move out_min
-        *zero_point = static_cast<int32_t>(std::ceil(quantized_zero));
-        *scale = out_max / (quantized_max - *zero_point);
-      } else {
-        // keep out_min fixed, and move out_max
-        *scale = out_min / (quantized_min - *zero_point);
-      }
-    }
-  } else if (out_min > -kEps) {
-    *zero_point = quantized_min;
-  } else {
-    *zero_point = quantized_max;
-  }
-}
-
-template<typename T>
-inline T Saturate(float value) {
-  int rounded_value = static_cast<int>(value);
-  if (rounded_value <= std::numeric_limits<T>::lowest()) {
-    return std::numeric_limits<T>::lowest();
-  } else if (rounded_value >= std::numeric_limits<T>::max()) {
-    return std::numeric_limits<T>::max();
-  } else {
-    return static_cast<T>(rounded_value);
-  }
-}
-
-inline void FindMinMax(const float *input,
-                       const index_t size,
-                       float *min_val, float *max_val) {
-  float max_v = std::numeric_limits<float>::lowest();
-  float min_v = std::numeric_limits<float>::max();
-  for (index_t i = 0; i < size; ++i) {
-    max_v = std::max(max_v, input[i]);
-    min_v = std::min(min_v, input[i]);
-  }
-  *min_val = min_v;
-  *max_val = max_v;
-}
-
-template<typename T>
-inline void QuantizeWithScaleAndZeropoint(const float *input,
-                                          const index_t size,
-                                          float scale,
-                                          int32_t zero_point,
-                                          T *output) {
-  float recip_scale = 1 / scale;
-#pragma omp parallel for
-  for (int i = 0; i < size; ++i) {
-    output[i] = Saturate<T>(roundf(zero_point + recip_scale * input[i]));
-  }
-}
-
-template<typename T>
-inline void Quantize(const float *input,
-                     const index_t size,
-                     bool non_zero,
-                     T *output,
-                     float *scale,
-                     int32_t *zero_point) {
-  float in_min_data;
-  float in_max_data;
-  FindMinMax(input, size, &in_min_data, &in_max_data);
-
-  AdjustRange<T>(in_min_data, in_max_data, non_zero,
-                 scale, zero_point);
-
-  QuantizeWithScaleAndZeropoint(input, size, *scale, *zero_point, output);
-}
-
-template<typename T>
-inline void Dequantize(const T *input,
-                       const index_t size,
-                       const float scale,
-                       const int32_t zero_point,
-                       float *output) {
-#pragma omp parallel for
-  for (int i = 0; i < size; ++i) {
-    output[i] = scale * (input[i] - zero_point);
-  }
-}
-
-inline void QuantizeMultiplier(double multiplier,
-                               int32_t* output_multiplier,
-                               int32_t* shift) {
-  if (multiplier == 0.f) {
-    *output_multiplier = 0;
-    *shift = 0;
-    return;
-  }
-  const double q = std::frexp(multiplier, shift);
-  auto qint = static_cast<int64_t>(roundl(q * (1ll << 31)));
-  if (qint == (1ll << 31)) {
-    qint /= 2;
-    ++*shift;
-  }
-  *output_multiplier = static_cast<int32_t>(qint);
-  MACE_CHECK(*output_multiplier <= std::numeric_limits<int32_t>::max());
-}
-
-inline void GetOutputMultiplierAndShift(
-    const float lhs_scale, const float rhs_scale, const float output_scale,
-    int32_t *quantized_multiplier, int *right_shift) {
-  float real_multiplier = lhs_scale * rhs_scale / output_scale;
-  MACE_CHECK(real_multiplier > 0.f && real_multiplier < 1.f, real_multiplier);
-
-  int exponent;
-  QuantizeMultiplier(real_multiplier, quantized_multiplier, &exponent);
-  *right_shift = -exponent;
-  MACE_CHECK(*right_shift >= 0);
-}
-
 template<DeviceType D, typename T>
 struct QuantizeFunctor;
 
diff --git a/mace/ops/conv_2d_test.cc b/mace/ops/conv_2d_test.cc
index dd338e275356fd1bfab3cc21b50fade2b77da46e..750d64ef2feba2ca855e7dfe7e4c067d213b35ae 100644
--- a/mace/ops/conv_2d_test.cc
+++ b/mace/ops/conv_2d_test.cc
@@ -1174,7 +1174,7 @@ void TestQuant(const index_t batch,
   Tensor *bias = net.GetTensor("Bias");
   auto bias_data = bias->data<float>();
   std::vector<int32_t> q_bias(bias->size());
-  kernels::QuantizeWithScaleAndZeropoint(
+  QuantizeWithScaleAndZeropoint(
       bias_data, bias->size(), q_input->scale() * q_filter->scale(), 0,
       q_bias.data());
   net.AddInputFromArray<DeviceType::CPU, int32_t>("QuantizedBias",
diff --git a/mace/ops/depthwise_conv2d_test.cc b/mace/ops/depthwise_conv2d_test.cc
index 6d6b84f1f79356d9f1eb6411fc564c912fca3e1d..7f7e1c17e7d25ef7c5813cdaf596f8dbb21ae6c1 100644
--- a/mace/ops/depthwise_conv2d_test.cc
+++ b/mace/ops/depthwise_conv2d_test.cc
@@ -459,7 +459,7 @@ void TestQuant(const index_t batch,
   Tensor *bias = net.GetTensor("Bias");
   auto bias_data = bias->data<float>();
   std::vector<int32_t> q_bias(bias->size());
-  kernels::QuantizeWithScaleAndZeropoint(
+  QuantizeWithScaleAndZeropoint(
       bias_data, bias->size(), q_input->scale() * q_filter->scale(), 0,
       q_bias.data());
   net.AddInputFromArray<DeviceType::CPU, int32_t>(
diff --git a/mace/ops/fully_connected_test.cc b/mace/ops/fully_connected_test.cc
index cdeba2439a94e5987c9844c4482f57af78dbb14c..4bdc53f4cab924886e1d9df5c582bac26e2dfa7c 100644
--- a/mace/ops/fully_connected_test.cc
+++ b/mace/ops/fully_connected_test.cc
@@ -277,7 +277,7 @@ void QuantRandom(const index_t batch,
   Tensor *bias = net.GetTensor("Bias");
   auto bias_data = bias->data<float>();
   std::vector<int32_t> q_bias(bias->size());
-  kernels::QuantizeWithScaleAndZeropoint(
+  QuantizeWithScaleAndZeropoint(
       bias_data, bias->size(), q_input->scale() * q_weight->scale(), 0,
       q_bias.data());
   net.AddInputFromArray<DeviceType::CPU, int32_t>("QuantizedBias",
diff --git a/mace/proto/mace.proto b/mace/proto/mace.proto
index 9ec3d96e27a919eeef74cce716e6006a7e66498d..0bf4c7da2c8b4d77dd042bd2c0c7a30cf183a942 100644
--- a/mace/proto/mace.proto
+++ b/mace/proto/mace.proto
@@ -36,6 +36,7 @@ message ConstTensor {
   optional int64 data_size = 7;
   optional float scale = 8;
   optional int32 zero_point = 9;
+  optional bool quantized = 10 [default = false];
 
   optional uint32 node_id = 100;
 }
diff --git a/mace/python/tools/converter_tool/transformer.py b/mace/python/tools/converter_tool/transformer.py
index 2302626d1ce83c7a718be6c974f03282d79db4a0..5fbab749ddcf0f163bad9f42ee3987c989712ce8 100644
--- a/mace/python/tools/converter_tool/transformer.py
+++ b/mace/python/tools/converter_tool/transformer.py
@@ -1667,9 +1667,6 @@ class Transformer(base_converter.ConverterInterface):
 
     def quantize_tensor(self, tensor):
         """Assume biasadd has been already folded with convolution and fc"""
-        if not self._option.quantize:
-            return False
-
         if tensor.data_type == mace_pb2.DT_FLOAT:
             ops = self._consumers.get(tensor.name, None)
             if len(ops) == 1 and ops[0].type in [MaceOp.Conv2D.name,
@@ -1697,6 +1694,7 @@ class Transformer(base_converter.ConverterInterface):
             tensor.int32_data.extend(quantized_tensor.data)
             tensor.scale = quantized_tensor.scale
             tensor.zero_point = quantized_tensor.zero
+            tensor.quantized = True
             self._quantized_tensor.update([tensor.name])
 
         return False
diff --git a/mace/python/tools/tensor_source.jinja2 b/mace/python/tools/tensor_source.jinja2
index 66feee0ed7af780806e46ae37080f084cf8c0efb..9cb3805d519e56f53b70ac39fe7b81138fc2b0ae 100644
--- a/mace/python/tools/tensor_source.jinja2
+++ b/mace/python/tools/tensor_source.jinja2
@@ -34,6 +34,7 @@ void CreateTensor{{tensor_info.id}}(mace::ConstTensor *const_tensor) {
   const_tensor->set_node_id({{ tensor.node_id }});
   const_tensor->set_scale({{ tensor.scale }});
   const_tensor->set_zero_point({{ tensor.zero_point }});
+  const_tensor->set_quantized({{ tensor.quantized | lower}});
 }
 
 }  // namespace {{tag}}
diff --git a/mace/utils/BUILD b/mace/utils/BUILD
index c383eb87d1d56ae3ae0e7aec13517a43128ad66a..283efa490e0a54ed48df9a61a289e5b67bf503f8 100644
--- a/mace/utils/BUILD
+++ b/mace/utils/BUILD
@@ -15,20 +15,17 @@ cc_library(
         "logging.cc",
         "string_util.cc",
     ],
-    hdrs = [
-        "env_time.h",
-        "logging.h",
-        "memory_logging.h",
-        "rwlock.h",
-        "string_util.h",
-        "timer.h",
-        "tuner.h",
-        "utils.h",
+    hdrs = glob([
+        "*.h",
+    ]),
+    copts = [
+        "-Werror",
+        "-Wextra",
+        "-Wno-missing-field-initializers",
     ],
     linkopts = if_android([
         "-llog",
     ]),
-    copts = ["-Werror", "-Wextra", "-Wno-missing-field-initializers"],
     deps = [
         "//mace/public",
     ],
@@ -40,7 +37,11 @@ cc_test(
     srcs = [
         "tuner_test.cc",
     ],
-    copts = ["-Werror", "-Wextra", "-Wno-missing-field-initializers"],
+    copts = [
+        "-Werror",
+        "-Wextra",
+        "-Wno-missing-field-initializers",
+    ],
     linkopts = ["-ldl"] + if_android([
         "-pie",
         "-lm",  # Required by unordered_map
diff --git a/mace/utils/quantize.h b/mace/utils/quantize.h
new file mode 100644
index 0000000000000000000000000000000000000000..dfaaff1560925c6d1674958ea8f9ae55f4842dd6
--- /dev/null
+++ b/mace/utils/quantize.h
@@ -0,0 +1,169 @@
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_UTILS_QUANTIZE_H_
+#define MACE_UTILS_QUANTIZE_H_
+
+#include <limits>
+#include <algorithm>
+
+namespace mace {
+
+template<typename T>
+inline void AdjustRange(const float in_min_data,
+                        const float in_max_data,
+                        const bool non_zero,
+                        float *scale,
+                        int32_t *zero_point) {
+  // re-range to make range include zero float and
+  // make zero float as integer u8
+  const T quantized_min = std::numeric_limits<T>::lowest();
+  const T quantized_max = std::numeric_limits<T>::max();
+  if (quantized_min < 0) {
+    MACE_ASSERT(!non_zero, "Cannot nudge to non_zero quantize value.");
+  }
+
+  float out_max = std::max(0.f, in_max_data);
+  float out_min = std::min(0.f, in_min_data);
+  // make in_min_data quantize as greater than 1
+  if (non_zero) {
+    out_min = std::min(out_min,
+                       in_min_data - (out_max - in_min_data)
+                           / (quantized_max - quantized_min - 1));
+  }
+
+  *scale = (out_max - out_min) / (quantized_max - quantized_min);
+  const float kEps = 1e-6;
+  if (out_min < -kEps && out_max > kEps) {
+    float quantized_zero = -out_min / *scale;
+    int32_t
+        quantized_zero_near_int = static_cast<int32_t>(roundf(quantized_zero));
+    *zero_point = quantized_zero_near_int;
+    if (fabs(quantized_zero - quantized_zero_near_int) > kEps) {
+      if (quantized_zero < quantized_zero_near_int || non_zero) {
+        // keep out_max fixed, and move out_min
+        *zero_point = static_cast<int32_t>(std::ceil(quantized_zero));
+        *scale = out_max / (quantized_max - *zero_point);
+      } else {
+        // keep out_min fixed, and move out_max
+        *scale = out_min / (quantized_min - *zero_point);
+      }
+    }
+  } else if (out_min > -kEps) {
+    *zero_point = quantized_min;
+  } else {
+    *zero_point = quantized_max;
+  }
+}
+
+template<typename T>
+inline T Saturate(float value) {
+  int rounded_value = static_cast<int>(value);
+  if (rounded_value <= std::numeric_limits<T>::lowest()) {
+    return std::numeric_limits<T>::lowest();
+  } else if (rounded_value >= std::numeric_limits<T>::max()) {
+    return std::numeric_limits<T>::max();
+  } else {
+    return static_cast<T>(rounded_value);
+  }
+}
+
+inline void FindMinMax(const float *input,
+                       const index_t size,
+                       float *min_val, float *max_val) {
+  float max_v = std::numeric_limits<float>::lowest();
+  float min_v = std::numeric_limits<float>::max();
+  for (index_t i = 0; i < size; ++i) {
+    max_v = std::max(max_v, input[i]);
+    min_v = std::min(min_v, input[i]);
+  }
+  *min_val = min_v;
+  *max_val = max_v;
+}
+
+template<typename T>
+inline void QuantizeWithScaleAndZeropoint(const float *input,
+                                          const index_t size,
+                                          float scale,
+                                          int32_t zero_point,
+                                          T *output) {
+  float recip_scale = 1 / scale;
+#pragma omp parallel for
+  for (int i = 0; i < size; ++i) {
+    output[i] = Saturate<T>(roundf(zero_point + recip_scale * input[i]));
+  }
+}
+
+template<typename T>
+inline void Quantize(const float *input,
+                     const index_t size,
+                     bool non_zero,
+                     T *output,
+                     float *scale,
+                     int32_t *zero_point) {
+  float in_min_data;
+  float in_max_data;
+  FindMinMax(input, size, &in_min_data, &in_max_data);
+
+  AdjustRange<T>(in_min_data, in_max_data, non_zero,
+                 scale, zero_point);
+
+  QuantizeWithScaleAndZeropoint(input, size, *scale, *zero_point, output);
+}
+
+template<typename T>
+inline void Dequantize(const T *input,
+                       const index_t size,
+                       const float scale,
+                       const int32_t zero_point,
+                       float *output) {
+#pragma omp parallel for
+  for (int i = 0; i < size; ++i) {
+    output[i] = scale * (input[i] - zero_point);
+  }
+}
+
+inline void QuantizeMultiplier(double multiplier,
+                               int32_t* output_multiplier,
+                               int32_t* shift) {
+  if (multiplier == 0.f) {
+    *output_multiplier = 0;
+    *shift = 0;
+    return;
+  }
+  const double q = std::frexp(multiplier, shift);
+  auto qint = static_cast<int64_t>(roundl(q * (1ll << 31)));
+  if (qint == (1ll << 31)) {
+    qint /= 2;
+    ++*shift;
+  }
+  *output_multiplier = static_cast<int32_t>(qint);
+  MACE_CHECK(*output_multiplier <= std::numeric_limits<int32_t>::max());
+}
+
+inline void GetOutputMultiplierAndShift(
+    const float lhs_scale, const float rhs_scale, const float output_scale,
+    int32_t *quantized_multiplier, int *right_shift) {
+  float real_multiplier = lhs_scale * rhs_scale / output_scale;
+  MACE_CHECK(real_multiplier > 0.f && real_multiplier < 1.f, real_multiplier);
+
+  int exponent;
+  QuantizeMultiplier(real_multiplier, quantized_multiplier, &exponent);
+  *right_shift = -exponent;
+  MACE_CHECK(*right_shift >= 0);
+}
+
+}  // namespace mace
+
+#endif  // MACE_UTILS_QUANTIZE_H_