separate quantize code when run without quantize

e35a077a · yejianwu · e988e951 · e35a077a · e35a077a · e35a077a
25 changed files
--- a/mace/BUILD
+++ b/mace/BUILD
@@ -59,3 +59,11 @@ config_setting(
    },
    visibility = ["//visibility:public"],
 )
+
+config_setting(
+    name = "quantize_enabled",
+    define_values = {
+        "quantize": "true",
+    },
+    visibility = ["//visibility:public"],
+)
--- a/mace/core/BUILD
+++ b/mace/core/BUILD
@@ -15,6 +15,7 @@ load(
    "if_openmp_enabled",
    "if_neon_enabled",
    "if_opencl_enabled",
+    "if_quantize_enabled",
 )

 cc_library(
@@ -51,6 +52,8 @@ cc_library(
        "-DMACE_ENABLE_OPENMP",
    ]) + if_opencl_enabled([
        "-DMACE_ENABLE_OPENCL",
+    ]) + if_quantize_enabled([
+        "-DMACE_ENABLE_QUANTIZE",
    ]) + if_hexagon_enabled([
        "-DMACE_ENABLE_HEXAGON",
    ]) + if_neon_enabled([
@@ -64,11 +67,12 @@ cc_library(
        "//mace/codegen:generated_version",
        "//mace/proto:mace_cc",
        "//mace/utils",
-        "@gemmlowp",
    ] + if_opencl_enabled([
        ":opencl_headers",
        "//mace/codegen:generated_opencl",
        "@half//:half",
+    ]) + if_quantize_enabled([
+        "@gemmlowp",
    ]) + if_hexagon_enabled([
        "//third_party/nnlib:libhexagon",
    ]),

--- a/mace/core/runtime/cpu/cpu_runtime.cc
+++ b/mace/core/runtime/cpu/cpu_runtime.cc
@@ -176,11 +176,16 @@ MaceStatus SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads,
 MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy(
    int omp_num_threads_hint,
    CPUAffinityPolicy policy,
-    gemmlowp::GemmContext *gemm_context) {
+    void *gemm_context) {
  if (policy == CPUAffinityPolicy::AFFINITY_NONE) {
+#ifdef MACE_ENABLE_QUANTIZE
    if (gemm_context) {
-      gemm_context->set_max_num_threads(std::max(0, omp_num_threads_hint));
+      static_cast<gemmlowp::GemmContext*>(gemm_context)->set_max_num_threads(
+          std::max(0, omp_num_threads_hint));
    }
+#else
+    MACE_UNUSED(gemm_context);
+#endif  // MACE_ENABLE_QUANTIZE
 #ifdef MACE_ENABLE_OPENMP
    if (omp_num_threads_hint > 0) {
      omp_set_num_threads(std::min(omp_num_threads_hint, omp_get_num_procs()));
@@ -210,9 +215,12 @@ MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy(
    omp_num_threads_hint = use_cpu_ids.size();
  }

+#ifdef MACE_ENABLE_QUANTIZE
  if (gemm_context) {
-    gemm_context->set_max_num_threads(omp_num_threads_hint);
+    static_cast<gemmlowp::GemmContext*>(gemm_context)->set_max_num_threads(
+        omp_num_threads_hint);
  }
+#endif  // MACE_ENABLE_QUANTIZE

  return SetOpenMPThreadsAndAffinityCPUs(omp_num_threads_hint, use_cpu_ids);
 }

--- a/mace/core/runtime/cpu/cpu_runtime.h
+++ b/mace/core/runtime/cpu/cpu_runtime.h
@@ -18,7 +18,11 @@
 #include <memory>
 #include <vector>

+#ifdef MACE_ENABLE_QUANTIZE
 #include "public/gemmlowp.h"
+#endif  // MACE_ENABLE_QUANTIZE
+
+#include "mace/core/macros.h"
 #include "mace/public/mace.h"
 #include "mace/utils/logging.h"

@@ -34,22 +38,34 @@ class CPURuntime {
      : num_threads_(num_threads),
        policy_(policy),
        gemm_context_(nullptr) {
+#ifdef MACE_ENABLE_QUANTIZE
    if (use_gemmlowp) {
      MACE_CHECK_NOTNULL(GetGemmlowpContext());
    }
-
+#else
+    MACE_UNUSED(use_gemmlowp);
+#endif  // MACE_ENABLE_QUANTIZE
    SetOpenMPThreadsAndAffinityPolicy(num_threads_,
                                      policy_,
-                                      gemm_context_.get());
+                                      gemm_context_);
+  }
+
+#ifdef MACE_ENABLE_QUANTIZE
+  ~CPURuntime() {  // releases the context created by GetGemmlowpContext()
+    if (gemm_context_) {  // non-null only after the lazy allocation ran
+      delete static_cast<gemmlowp::GemmContext*>(gemm_context_);
+    }
  }
-  ~CPURuntime() = default;

  gemmlowp::GemmContext *GetGemmlowpContext() {
    if (!gemm_context_) {
-      gemm_context_.reset(new gemmlowp::GemmContext());
+      gemm_context_ = new gemmlowp::GemmContext();
    }
-    return gemm_context_.get();
+    return static_cast<gemmlowp::GemmContext*>(gemm_context_);
  }
+#else
+  ~CPURuntime() = default;
+#endif  // MACE_ENABLE_QUANTIZE

  int num_threads() const {
    return num_threads_;
@@ -67,11 +83,11 @@ class CPURuntime {
  MaceStatus SetOpenMPThreadsAndAffinityPolicy(
      int omp_num_threads_hint,
      CPUAffinityPolicy policy,
-      gemmlowp::GemmContext *gemm_context);
+      void *gemm_context);

  int num_threads_;
  CPUAffinityPolicy policy_;
-  std::unique_ptr<gemmlowp::GemmContext> gemm_context_;
+  void *gemm_context_;
 };
 }  // namespace mace


--- a/mace/libmace/BUILD
+++ b/mace/libmace/BUILD
@@ -16,6 +16,7 @@ load(
    "if_hexagon_enabled",
    "if_opencl_enabled",
    "if_opencl_enabled_str",
+    "if_quantize_enabled",
 )

 cc_library(
@@ -34,6 +35,8 @@ cc_library(
        "-mfloat-abi=softfp",
    ]) + if_opencl_enabled([
        "-DMACE_ENABLE_OPENCL",
+    ]) + if_quantize_enabled([
+        "-DMACE_ENABLE_QUANTIZE",
    ]) + if_hexagon_enabled([
        "-DMACE_ENABLE_HEXAGON",
    ]),

--- a/mace/mace.bzl
+++ b/mace/mace.bzl
@@ -60,6 +60,12 @@ def if_opencl_enabled_str(a):
      "//conditions:default": "",
  })

+def if_quantize_enabled(a):
+  return select({
+      "//mace:quantize_enabled": a,
+      "//conditions:default": [],
+  })
+
 def mace_version_genrule():
  native.genrule(
      name = "mace_version_gen",

--- a/mace/ops/BUILD
+++ b/mace/ops/BUILD
@@ -14,6 +14,7 @@ load(
    "if_android_armv7",
    "if_hexagon_enabled",
    "if_opencl_enabled",
+    "if_quantize_enabled",
 )

 cc_library(
@@ -33,6 +34,7 @@ cc_library(
            "buffer_transform.cc",
            "lstm_cell.cc",
            "winograd_transform.cc",
+            "quantize.cc",
        ],
    ) + if_opencl_enabled(glob(
        [
@@ -47,6 +49,10 @@ cc_library(
        exclude = [
            "opencl/*_test.cc",
        ],
+    )) + if_quantize_enabled(glob(
+        [
+            "quantize.cc",
+        ],
    )),
    hdrs = glob(
        [
@@ -56,11 +62,16 @@ cc_library(
        exclude = [
            "ops_registry.h",
            "ops_test_util.h",
+            "fixpoint.h",
+            "gemmlowp_util.h",
        ]
    ) + if_opencl_enabled(glob([
        "opencl/*.h",
        "opencl/image/*.h",
        "opencl/buffer/*.h",
+    ])) + if_quantize_enabled(glob([
+        "fixpoint.h",
+        "gemmlowp_util.h",
    ])),
    copts = [
        "-Werror",
@@ -76,15 +87,18 @@ cc_library(
        "-mfloat-abi=softfp",
    ]) + if_opencl_enabled([
        "-DMACE_ENABLE_OPENCL",
+    ]) + if_quantize_enabled([
+        "-DMACE_ENABLE_QUANTIZE",
    ]) + if_hexagon_enabled([
        "-DMACE_ENABLE_HEXAGON",
    ]),
    linkopts = if_android(["-lm"]),
    deps = [
        "//mace/core",
-        "@gemmlowp",
        "@tflite",
-    ],
+    ] + if_quantize_enabled([
+        "@gemmlowp",
+    ]),
 )


@@ -110,6 +124,8 @@ cc_library(
        "-mfloat-abi=softfp",
    ]) + if_opencl_enabled([
        "-DMACE_ENABLE_OPENCL",
+    ]) + if_quantize_enabled([
+        "-DMACE_ENABLE_QUANTIZE",
    ]) + if_hexagon_enabled([
        "-DMACE_ENABLE_HEXAGON",
    ]),
@@ -157,7 +173,15 @@ cc_test(
            "arm/*_test.cc",
            "opencl/*_test.cc",
        ],
-    ),
+        exclude = [
+            "fixpoint_test.cc"
+        ],
+    ) + if_quantize_enabled(glob(
+        [
+            "fixpoint_test.cc"
+        ],
+
+    )),
    copts = [
        "-Werror",
        "-Wextra",
@@ -171,6 +195,8 @@ cc_test(
        "-mfloat-abi=softfp",
    ]) + if_opencl_enabled([
        "-DMACE_ENABLE_OPENCL",
+    ]) + if_quantize_enabled([
+        "-DMACE_ENABLE_QUANTIZE",
    ]) + if_hexagon_enabled([
        "-DMACE_ENABLE_HEXAGON",
    ]),
@@ -199,6 +225,8 @@ cc_test(
        "-mfloat-abi=softfp",
    ]) + if_opencl_enabled([
        "-DMACE_ENABLE_OPENCL",
+    ]) + if_quantize_enabled([
+        "-DMACE_ENABLE_QUANTIZE",
    ]) + if_hexagon_enabled([
        "-DMACE_ENABLE_HEXAGON",
    ]),
@@ -208,6 +236,5 @@ cc_test(
        "test",
        "//mace/core:test_benchmark_main",
        "//third_party/eigen3",
-        "@gemmlowp",
    ],
 )
--- a/mace/ops/concat.cc
+++ b/mace/ops/concat.cc
@@ -108,6 +108,7 @@ class ConcatOp<DeviceType::CPU, T> : public ConcatOpBase {
  }
 };

+#ifdef MACE_ENABLE_QUANTIZE
 template <>
 class ConcatOp<DeviceType::CPU, uint8_t> : public ConcatOpBase {
 public:
@@ -177,6 +178,7 @@ class ConcatOp<DeviceType::CPU, uint8_t> : public ConcatOpBase {
    return MaceStatus::MACE_SUCCESS;
  }
 };
+#endif  // MACE_ENABLE_QUANTIZE

 #ifdef MACE_ENABLE_OPENCL
 template <typename T>
@@ -209,8 +211,10 @@ void RegisterConcat(OpRegistryBase *op_registry) {
  MACE_REGISTER_OP(op_registry, "Concat", ConcatOp,
                   DeviceType::CPU, int32_t);

+#ifdef MACE_ENABLE_QUANTIZE
  MACE_REGISTER_OP(op_registry, "Concat", ConcatOp,
                   DeviceType::CPU, uint8_t);
+#endif  // MACE_ENABLE_QUANTIZE

 #ifdef MACE_ENABLE_OPENCL
  MACE_REGISTER_OP(op_registry, "Concat", ConcatOp,

--- a/mace/ops/conv_2d.cc
+++ b/mace/ops/conv_2d.cc
@@ -31,8 +31,12 @@
 #include "mace/ops/arm/conv_winograd.h"
 #include "mace/ops/conv_pool_2d_base.h"
 #include "mace/ops/conv_pool_2d_util.h"
-#include "mace/ops/gemmlowp_util.h"
 #include "mace/utils/utils.h"
+
+#ifdef MACE_ENABLE_QUANTIZE
+#include "mace/ops/gemmlowp_util.h"
+#endif  // MACE_ENABLE_QUANTIZE
+
 #ifdef MACE_ENABLE_OPENCL
 #include "mace/ops/opencl/image/conv_2d.h"
 #include "mace/ops/opencl/buffer/conv_2d.h"
@@ -707,6 +711,7 @@ class Conv2dOp<DeviceType::CPU, float> : public ConvPool2dOpBase {
 };


+#ifdef MACE_ENABLE_QUANTIZE
 template <>
 class Conv2dOp<DeviceType::CPU, uint8_t> : public ConvPool2dOpBase {
 public:
@@ -943,6 +948,7 @@ class Conv2dOp<DeviceType::CPU, uint8_t> : public ConvPool2dOpBase {
  MACE_OP_INPUT_TAGS(INPUT, FILTER, BIAS);
  MACE_OP_OUTPUT_TAGS(OUTPUT);
 };
+#endif  // MACE_ENABLE_QUANTIZE

 #ifdef MACE_ENABLE_OPENCL
 template <typename T>
@@ -987,8 +993,10 @@ void RegisterConv2D(OpRegistryBase *op_registry) {
  MACE_REGISTER_OP(op_registry, "Conv2D", Conv2dOp,
                   DeviceType::CPU, float);

+#ifdef MACE_ENABLE_QUANTIZE
  MACE_REGISTER_OP(op_registry, "Conv2D", Conv2dOp,
                   DeviceType::CPU, uint8_t);
+#endif  // MACE_ENABLE_QUANTIZE

 #ifdef MACE_ENABLE_OPENCL
  MACE_REGISTER_OP(op_registry, "Conv2D", Conv2dOp,

--- a/mace/ops/depthwise_conv2d.cc
+++ b/mace/ops/depthwise_conv2d.cc
@@ -20,9 +20,11 @@
 #include <string>
 #include <vector>

+#ifdef MACE_ENABLE_QUANTIZE
 // We reuse TensorFlow Lite's optimized depthwiseconv_uint8 and parallelized it
 // using OpenMP for MACE's quantized depthwise_conv2d.
 #include "tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h"
+#endif  // MACE_ENABLE_QUANTIZE

 #include "mace/core/future.h"
 #include "mace/core/operator.h"
@@ -276,6 +278,7 @@ class DepthwiseConv2dOp<DeviceType::CPU, float> : public DepthwiseConv2dOpBase {
  MACE_OP_OUTPUT_TAGS(OUTPUT);
 };

+#ifdef MACE_ENABLE_QUANTIZE
 template <>
 class DepthwiseConv2dOp<DeviceType::CPU, uint8_t>
    : public DepthwiseConv2dOpBase {
@@ -479,6 +482,7 @@ class DepthwiseConv2dOp<DeviceType::CPU, uint8_t>
  MACE_OP_INPUT_TAGS(INPUT, FILTER, BIAS);
  MACE_OP_OUTPUT_TAGS(OUTPUT);
 };
+#endif  // MACE_ENABLE_QUANTIZE

 #ifdef MACE_ENABLE_OPENCL
 template <typename T>
@@ -520,8 +524,10 @@ void RegisterDepthwiseConv2d(OpRegistryBase *op_registry) {
  MACE_REGISTER_OP(op_registry, "DepthwiseConv2d",
                   DepthwiseConv2dOp, DeviceType::CPU, float);

+#ifdef MACE_ENABLE_QUANTIZE
  MACE_REGISTER_OP(op_registry, "DepthwiseConv2d",
                   DepthwiseConv2dOp, DeviceType::CPU, uint8_t);
+#endif  // MACE_ENABLE_QUANTIZE

 #ifdef MACE_ENABLE_OPENCL
  MACE_REGISTER_OP(op_registry, "DepthwiseConv2d",

--- a/mace/ops/eltwise.cc
+++ b/mace/ops/eltwise.cc
@@ -932,6 +932,7 @@ class EltwiseOp : public Operation {
  Tensor scalar_tensor_;
 };

+#ifdef MACE_ENABLE_QUANTIZE
 template <>
 class EltwiseOp<DeviceType::CPU, uint8_t> : public Operation {
 public:
@@ -1072,6 +1073,7 @@ class EltwiseOp<DeviceType::CPU, uint8_t> : public Operation {
  DataFormat data_format_;
  Tensor scalar_tensor_;
 };
+#endif  // MACE_ENABLE_QUANTIZE

 #ifdef MACE_ENABLE_OPENCL
 template <typename T>
@@ -1113,8 +1115,11 @@ void RegisterEltwise(OpRegistryBase *op_registry) {
  MACE_REGISTER_OP(op_registry, "Eltwise", EltwiseOp,
                   DeviceType::CPU, int32_t);

+#ifdef MACE_ENABLE_QUANTIZE
  MACE_REGISTER_OP(op_registry, "Eltwise", EltwiseOp,
                   DeviceType::CPU, uint8_t);
+#endif  // MACE_ENABLE_QUANTIZE
+
 #ifdef MACE_ENABLE_OPENCL
  MACE_REGISTER_OP(op_registry, "Eltwise", EltwiseOp,
                   DeviceType::GPU, float);

--- a/mace/ops/expand_dims.cc
+++ b/mace/ops/expand_dims.cc
@@ -63,8 +63,10 @@ void RegisterExpandDims(OpRegistryBase *op_registry) {
  MACE_REGISTER_OP(op_registry, "ExpandDims", ExpandDimsOp,
                   DeviceType::CPU, int32_t);

+#ifdef MACE_ENABLE_QUANTIZE
  MACE_REGISTER_OP(op_registry, "ExpandDims", ExpandDimsOp,
                   DeviceType::CPU, uint8_t);
+#endif  // MACE_ENABLE_QUANTIZE
 }

 }  // namespace ops

--- a/mace/ops/fully_connected.cc
+++ b/mace/ops/fully_connected.cc
@@ -21,7 +21,10 @@
 #include "mace/core/tensor.h"
 #include "mace/ops/activation.h"
 #include "mace/ops/gemm.h"
+
+#ifdef MACE_ENABLE_QUANTIZE
 #include "mace/ops/gemmlowp_util.h"
+#endif  // MACE_ENABLE_QUANTIZE

 #ifdef MACE_ENABLE_OPENCL
 #include "mace/ops/opencl/image/fully_connected.h"
@@ -106,6 +109,7 @@ class FullyConnectedOp<DeviceType::CPU, float> : public FullyConnectedOpBase {
  }
 };

+#ifdef MACE_ENABLE_QUANTIZE
 template <>
 class FullyConnectedOp<DeviceType::CPU, uint8_t>
    : public FullyConnectedOpBase {
@@ -180,6 +184,7 @@ class FullyConnectedOp<DeviceType::CPU, uint8_t>
    return MaceStatus::MACE_SUCCESS;
  }
 };
+#endif  // MACE_ENABLE_QUANTIZE

 #ifdef MACE_ENABLE_OPENCL
 template <typename T>
@@ -218,8 +223,11 @@ void RegisterFullyConnected(OpRegistryBase *op_registry) {
  MACE_REGISTER_OP(op_registry, "FullyConnected",
                   FullyConnectedOp, DeviceType::CPU, float);

+#ifdef MACE_ENABLE_QUANTIZE
  MACE_REGISTER_OP(op_registry, "FullyConnected",
                   FullyConnectedOp, DeviceType::CPU, uint8_t);
+#endif  // MACE_ENABLE_QUANTIZE
+
 #ifdef MACE_ENABLE_OPENCL
  MACE_REGISTER_OP(op_registry, "FullyConnected",
                   FullyConnectedOp, DeviceType::GPU, float);

--- a/mace/ops/gather.cc
+++ b/mace/ops/gather.cc
@@ -89,8 +89,11 @@ class GatherOp : public Operation {
 void RegisterGather(OpRegistryBase *op_registry) {
  MACE_REGISTER_OP(op_registry, "Gather", GatherOp,
                   DeviceType::CPU, float);
+
+#ifdef MACE_ENABLE_QUANTIZE
  MACE_REGISTER_OP(op_registry, "Gather", GatherOp,
                   DeviceType::CPU, uint8_t);
+#endif  // MACE_ENABLE_QUANTIZE
 }

 }  // namespace ops

--- a/mace/ops/matmul.cc
+++ b/mace/ops/matmul.cc
@@ -22,9 +22,13 @@
 #include "mace/core/operator.h"
 #include "mace/core/tensor.h"
 #include "mace/ops/gemm.h"
-#include "mace/ops/gemmlowp_util.h"
 #include "mace/ops/sgemm.h"
 #include "mace/utils/utils.h"
+
+#ifdef MACE_ENABLE_QUANTIZE
+#include "mace/ops/gemmlowp_util.h"
+#endif  // MACE_ENABLE_QUANTIZE
+
 #ifdef MACE_ENABLE_OPENCL
 #include "mace/ops/opencl/image/matmul.h"
 #endif  // MACE_ENABLE_OPENCL
@@ -150,6 +154,7 @@ class MatMulOp<CPU, float> : public MatMulOpBase {
  SGemm sgemm_;
 };

+#ifdef MACE_ENABLE_QUANTIZE
 template<gemmlowp::MapOrder AOrder, gemmlowp::MapOrder BOrder,
    typename OutputType>
 class MatMulFixpointImpl;
@@ -311,6 +316,7 @@ class MatMulOp<DeviceType::CPU, uint8_t>: public MatMulOpBase {
    return MaceStatus::MACE_SUCCESS;
  }
 };
+#endif  // MACE_ENABLE_QUANTIZE

 #ifdef MACE_ENABLE_OPENCL
 template <typename T>
@@ -342,8 +348,11 @@ void RegisterMatMul(OpRegistryBase *op_registry) {
  MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
                   DeviceType::CPU, float);

+#ifdef MACE_ENABLE_QUANTIZE
  MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
                   DeviceType::CPU, uint8_t);
+#endif  // MACE_ENABLE_QUANTIZE
+
 #ifdef MACE_ENABLE_OPENCL
  MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
                   DeviceType::GPU, float);

--- a/mace/ops/ops_registry.cc
+++ b/mace/ops/ops_registry.cc
@@ -33,7 +33,6 @@ extern void RegisterDeconv2D(OpRegistryBase *op_registry);
 extern void RegisterDepthToSpace(OpRegistryBase *op_registry);
 extern void RegisterDepthwiseConv2d(OpRegistryBase *op_registry);
 extern void RegisterDepthwiseDeconv2d(OpRegistryBase *op_registry);
-extern void RegisterDequantize(OpRegistryBase *op_registry);
 extern void RegisterEltwise(OpRegistryBase *op_registry);
 extern void RegisterExpandDims(OpRegistryBase *op_registry);
 extern void RegisterFill(OpRegistryBase *op_registry);
@@ -45,7 +44,6 @@ extern void RegisterLocalResponseNorm(OpRegistryBase *op_registry);
 extern void RegisterMatMul(OpRegistryBase *op_registry);
 extern void RegisterPad(OpRegistryBase *op_registry);
 extern void RegisterPooling(OpRegistryBase *op_registry);
-extern void RegisterQuantize(OpRegistryBase *op_registry);
 extern void RegisterReduceMean(OpRegistryBase *op_registry);
 extern void RegisterReshape(OpRegistryBase *op_registry);
 extern void RegisterResizeBicubic(OpRegistryBase *op_registry);
@@ -64,6 +62,11 @@ extern void RegisterStridedSlice(OpRegistryBase *op_registry);
 extern void RegisterTranspose(OpRegistryBase *op_registry);
 extern void RegisterUnstack(OpRegistryBase *op_registry);

+#ifdef MACE_ENABLE_QUANTIZE
+extern void RegisterDequantize(OpRegistryBase *op_registry);
+extern void RegisterQuantize(OpRegistryBase *op_registry);
+#endif  // MACE_ENABLE_QUANTIZE
+
 #ifdef MACE_ENABLE_OPENCL
 extern void RegisterBufferTransform(OpRegistryBase *op_registry);
 extern void RegisterBufferInverseTransform(OpRegistryBase *op_registry);
@@ -91,7 +94,6 @@ OpRegistry::OpRegistry() : OpRegistryBase() {
  ops::RegisterDepthToSpace(this);
  ops::RegisterDepthwiseConv2d(this);
  ops::RegisterDepthwiseDeconv2d(this);
-  ops::RegisterDequantize(this);
  ops::RegisterEltwise(this);
  ops::RegisterExpandDims(this);
  ops::RegisterFill(this);
@@ -103,7 +105,6 @@ OpRegistry::OpRegistry() : OpRegistryBase() {
  ops::RegisterMatMul(this);
  ops::RegisterPad(this);
  ops::RegisterPooling(this);
-  ops::RegisterQuantize(this);
  ops::RegisterReduceMean(this);
  ops::RegisterReshape(this);
  ops::RegisterResizeBicubic(this);
@@ -122,6 +123,11 @@ OpRegistry::OpRegistry() : OpRegistryBase() {
  ops::RegisterTranspose(this);
  ops::RegisterUnstack(this);

+#ifdef MACE_ENABLE_QUANTIZE
+  ops::RegisterDequantize(this);
+  ops::RegisterQuantize(this);
+#endif  // MACE_ENABLE_QUANTIZE
+
 #ifdef MACE_ENABLE_OPENCL
  ops::RegisterBufferTransform(this);
  ops::RegisterBufferInverseTransform(this);

--- a/mace/ops/pooling.cc
+++ b/mace/ops/pooling.cc
@@ -214,6 +214,7 @@ class PoolingOp<DeviceType::CPU, float> : public PoolingOpBase {
  }
 };

+#ifdef MACE_ENABLE_QUANTIZE
 template <>
 class PoolingOp<DeviceType::CPU, uint8_t> : public PoolingOpBase {
 public:
@@ -420,6 +421,7 @@ class PoolingOp<DeviceType::CPU, uint8_t> : public PoolingOpBase {
    }
  }
 };
+#endif  // MACE_ENABLE_QUANTIZE

 #ifdef MACE_ENABLE_OPENCL
 template <typename T>
@@ -451,8 +453,11 @@ class PoolingOp<DeviceType::GPU, T> : public PoolingOpBase {
 void RegisterPooling(OpRegistryBase *op_registry) {
  MACE_REGISTER_OP(op_registry, "Pooling", PoolingOp,
                   DeviceType::CPU, float);
+
+#ifdef MACE_ENABLE_QUANTIZE
  MACE_REGISTER_OP(op_registry, "Pooling", PoolingOp,
                   DeviceType::CPU, uint8_t);
+#endif  // MACE_ENABLE_QUANTIZE

 #ifdef MACE_ENABLE_OPENCL
  MACE_REGISTER_OP(op_registry, "Pooling", PoolingOp,

--- a/mace/ops/resize_bilinear.cc
+++ b/mace/ops/resize_bilinear.cc
@@ -244,6 +244,7 @@ class ResizeBilinearOp<DeviceType::CPU, T> : public Operation {
  std::vector<index_t> size_;
 };

+#ifdef MACE_ENABLE_QUANTIZE
 template <>
 class ResizeBilinearOp<DeviceType::CPU, uint8_t> : public Operation {
 public:
@@ -317,6 +318,7 @@ class ResizeBilinearOp<DeviceType::CPU, uint8_t> : public Operation {
  bool align_corners_;
  std::vector<index_t> size_;
 };
+#endif  // MACE_ENABLE_QUANTIZE

 #ifdef MACE_ENABLE_OPENCL
 template <typename T>
@@ -355,8 +357,10 @@ void RegisterResizeBilinear(OpRegistryBase *op_registry) {
  MACE_REGISTER_OP(op_registry, "ResizeBilinear", ResizeBilinearOp,
                   DeviceType::CPU, float);

+#ifdef MACE_ENABLE_QUANTIZE
  MACE_REGISTER_OP(op_registry, "ResizeBilinear", ResizeBilinearOp,
                   DeviceType::CPU, uint8_t);
+#endif  // MACE_ENABLE_QUANTIZE

 #ifdef MACE_ENABLE_OPENCL
  MACE_REGISTER_OP(op_registry, "ResizeBilinear", ResizeBilinearOp,

--- a/mace/ops/softmax.cc
+++ b/mace/ops/softmax.cc
@@ -18,8 +18,12 @@
 #include <vector>

 #include "mace/core/operator.h"
+
+#ifdef MACE_ENABLE_QUANTIZE
 #include "mace/ops/fixpoint.h"
 #include "mace/ops/gemmlowp_util.h"
+#endif  // MACE_ENABLE_QUANTIZE
+
 #ifdef MACE_ENABLE_OPENCL
 #include "mace/ops/opencl/image/softmax.h"
 #include "mace/ops/opencl/buffer/softmax.h"
@@ -122,6 +126,7 @@ class SoftmaxOp<DeviceType::CPU, float> : public Operation {
 static const int kInputDeltaIntBits = 6;
 static const int kSumExpIntBits = 12;

+#ifdef MACE_ENABLE_QUANTIZE
 template <>
 class SoftmaxOp<DeviceType::CPU, uint8_t> : public Operation {
 public:
@@ -351,6 +356,7 @@ class SoftmaxOp<DeviceType::CPU, uint8_t> : public Operation {
    return MaceStatus::MACE_SUCCESS;
  }
 };
+#endif  // MACE_ENABLE_QUANTIZE

 #ifdef MACE_ENABLE_OPENCL
 template <typename T>
@@ -382,8 +388,10 @@ void RegisterSoftmax(OpRegistryBase *op_registry) {
  MACE_REGISTER_OP(op_registry, "Softmax", SoftmaxOp,
                   DeviceType::CPU, float);

+#ifdef MACE_ENABLE_QUANTIZE
  MACE_REGISTER_OP(op_registry, "Softmax", SoftmaxOp,
                   DeviceType::CPU, uint8_t);
+#endif  // MACE_ENABLE_QUANTIZE

 #ifdef MACE_ENABLE_OPENCL
  MACE_REGISTER_OP(op_registry, "Softmax", SoftmaxOp,

--- a/mace/ops/space_to_batch.cc
+++ b/mace/ops/space_to_batch.cc
@@ -197,6 +197,7 @@ class SpaceToBatchNDOp<DeviceType::CPU, float> : public SpaceToBatchOpBase {
  }
 };

+#ifdef MACE_ENABLE_QUANTIZE
 template <>
 class SpaceToBatchNDOp<DeviceType::CPU, uint8_t> : public SpaceToBatchOpBase {
 public:
@@ -299,6 +300,7 @@ class SpaceToBatchNDOp<DeviceType::CPU, uint8_t> : public SpaceToBatchOpBase {
    return MaceStatus::MACE_SUCCESS;
  }
 };
+#endif  // MACE_ENABLE_QUANTIZE

 #ifdef MACE_ENABLE_OPENCL
 template <typename T>
@@ -331,8 +333,10 @@ void RegisterSpaceToBatchND(OpRegistryBase *op_registry) {
  MACE_REGISTER_OP(op_registry, "SpaceToBatchND",
                   SpaceToBatchNDOp, DeviceType::CPU, float);

+#ifdef MACE_ENABLE_QUANTIZE
  MACE_REGISTER_OP(op_registry, "SpaceToBatchND",
                   SpaceToBatchNDOp, DeviceType::CPU, uint8_t);
+#endif  // MACE_ENABLE_QUANTIZE

 #ifdef MACE_ENABLE_OPENCL
  MACE_REGISTER_OP(op_registry, "SpaceToBatchND",

--- a/mace/ops/squeeze.cc
+++ b/mace/ops/squeeze.cc
@@ -52,7 +52,9 @@ class SqueezeOp : public Operation {

 void RegisterSqueeze(OpRegistryBase *op_registry) {
  MACE_REGISTER_OP(op_registry, "Squeeze", SqueezeOp, DeviceType::CPU, float);
+#ifdef MACE_ENABLE_QUANTIZE
  MACE_REGISTER_OP(op_registry, "Squeeze", SqueezeOp, DeviceType::CPU, uint8_t);
+#endif  // MACE_ENABLE_QUANTIZE
 #ifdef MACE_ENABLE_OPENCL
  MACE_REGISTER_OP(op_registry, "Squeeze", SqueezeOp, DeviceType::GPU, float);
  MACE_REGISTER_OP(op_registry, "Squeeze", SqueezeOp, DeviceType::GPU, half);

--- a/mace/public/mace.h
+++ b/mace/public/mace.h
@@ -223,7 +223,7 @@ class MACE_API MaceEngineConfig {
  /// \param status MACE_SUCCESS for successful, or it can't reliabley
  /// detect big-LITTLE cores (see GetBigLittleCoreIDs). In such cases, it's
  /// suggested to use AFFINITY_NONE to use all cores.
-  /// \param use_gemmlowp use gemmlowp for quantized inference
+  /// \param use_gemmlowp use gemmlowp for cpu quantized inference
  /// \return MaceStatus::MACE_SUCCESS for success, other for failed.
  MaceStatus SetCPUThreadPolicy(int num_threads_hint,
                                CPUAffinityPolicy policy,

--- a/tools/build-standalone-lib.sh
+++ b/tools/build-standalone-lib.sh
@@ -24,41 +24,41 @@ mkdir -p $LIB_DIR/linux-x86-64

 # build shared libraries
 echo "build shared lib for armeabi-v7a + cpu_gpu_dsp"
-bazel build --config android --config optimization mace/libmace:libmace_dynamic --define neon=true --define openmp=true --define opencl=true --define hexagon=true --cpu=armeabi-v7a
+bazel build --config android --config optimization mace/libmace:libmace_dynamic --define neon=true --define openmp=true --define opencl=true --define hexagon=true --define quantize=true --cpu=armeabi-v7a
 cp bazel-bin/mace/libmace/libmace.so $LIB_DIR/armeabi-v7a/cpu_gpu_dsp/
 cp third_party/nnlib/*so $LIB_DIR/armeabi-v7a/cpu_gpu_dsp/

 echo "build shared lib for armeabi-v7a + cpu_gpu"
-bazel build --config android --config optimization mace/libmace:libmace_dynamic --define neon=true --define openmp=true --define opencl=true --cpu=armeabi-v7a
+bazel build --config android --config optimization mace/libmace:libmace_dynamic --define neon=true --define openmp=true --define opencl=true --define quantize=true --cpu=armeabi-v7a
 cp bazel-bin/mace/libmace/libmace.so $LIB_DIR/armeabi-v7a/cpu_gpu/

 echo "build shared lib for arm64-v8a + cpu_gpu"
-bazel build --config android --config optimization mace/libmace:libmace_dynamic --define neon=true --define openmp=true --define opencl=true --cpu=arm64-v8a
+bazel build --config android --config optimization mace/libmace:libmace_dynamic --define neon=true --define openmp=true --define opencl=true --define quantize=true --cpu=arm64-v8a
 cp bazel-bin/mace/libmace/libmace.so $LIB_DIR/arm64-v8a/cpu_gpu/

 if [[ "$OSTYPE" != "darwin"* ]];then
 	echo "build shared lib for linux-x86-64"
-	bazel build mace/libmace:libmace_dynamic --config optimization --define openmp=true
+	bazel build mace/libmace:libmace_dynamic --config optimization --define quantize=true --define openmp=true
 	cp bazel-bin/mace/libmace/libmace.so $LIB_DIR/linux-x86-64/
 fi

 # build static libraries
 echo "build static lib for armeabi-v7a + cpu_gpu_dsp"
-bazel build --config android --config optimization mace/libmace:libmace_static --config symbol_hidden --define neon=true --define openmp=true --define opencl=true --define hexagon=true --cpu=armeabi-v7a
+bazel build --config android --config optimization mace/libmace:libmace_static --config symbol_hidden --define neon=true --define openmp=true --define opencl=true --define hexagon=true --define quantize=true --cpu=armeabi-v7a
 cp bazel-genfiles/mace/libmace/libmace.a $LIB_DIR/armeabi-v7a/cpu_gpu_dsp/
 cp third_party/nnlib/*so $LIB_DIR/armeabi-v7a/cpu_gpu_dsp/

 echo "build static lib for armeabi-v7a + cpu_gpu"
-bazel build --config android --config optimization mace/libmace:libmace_static --config symbol_hidden --define neon=true --define openmp=true --define opencl=true --cpu=armeabi-v7a
+bazel build --config android --config optimization mace/libmace:libmace_static --config symbol_hidden --define neon=true --define openmp=true --define opencl=true --define quantize=true --cpu=armeabi-v7a
 cp bazel-genfiles/mace/libmace/libmace.a $LIB_DIR/armeabi-v7a/cpu_gpu/

 echo "build static lib for arm64-v8a + cpu_gpu"
-bazel build --config android --config optimization mace/libmace:libmace_static --config symbol_hidden --define neon=true --define openmp=true --define opencl=true --cpu=arm64-v8a
+bazel build --config android --config optimization mace/libmace:libmace_static --config symbol_hidden --define neon=true --define openmp=true --define opencl=true --define quantize=true --cpu=arm64-v8a
 cp bazel-genfiles/mace/libmace/libmace.a $LIB_DIR/arm64-v8a/cpu_gpu/

 if [[ "$OSTYPE" != "darwin"* ]];then
 	echo "build static lib for linux-x86-64"
-	bazel build mace/libmace:libmace_static --config optimization --define openmp=true
+	bazel build mace/libmace:libmace_static --config optimization --define quantize=true --define openmp=true
 	cp bazel-genfiles/mace/libmace/libmace.a $LIB_DIR/linux-x86-64/
 fi


--- a/tools/converter.py
+++ b/tools/converter.py
@@ -279,6 +279,17 @@ def get_opencl_mode(configs):
    return False


+def get_quantize_mode(configs):
+    for model_name in configs[YAMLKeyword.models]:
+        quantize =\
+            configs[YAMLKeyword.models][model_name].get(
+                YAMLKeyword.quantize, 0)
+        if quantize == 1:
+            return True
+
+    return False
+
+
 def md5sum(str):
    md5 = hashlib.md5()
    md5.update(str.encode('utf-8'))
@@ -855,6 +866,7 @@ def build_model_lib(configs, address_sanitizer):
            abi=target_abi,
            hexagon_mode=hexagon_mode,
            enable_opencl=get_opencl_mode(configs),
+            enable_quantize=get_quantize_mode(configs),
            address_sanitizer=address_sanitizer,
            symbol_hidden=True
        )
@@ -968,6 +980,7 @@ def build_mace_run(configs, target_abi, enable_openmp, address_sanitizer,
        hexagon_mode=hexagon_mode,
        enable_openmp=enable_openmp,
        enable_opencl=get_opencl_mode(configs),
+        enable_quantize=get_quantize_mode(configs),
        address_sanitizer=address_sanitizer,
        symbol_hidden=symbol_hidden,
        extra_args=build_arg
@@ -996,6 +1009,7 @@ def build_example(configs, target_abi, enable_openmp, address_sanitizer,
                            abi=target_abi,
                            enable_openmp=enable_openmp,
                            enable_opencl=get_opencl_mode(configs),
+                            enable_quantize=get_quantize_mode(configs),
                            hexagon_mode=hexagon_mode,
                            address_sanitizer=address_sanitizer,
                            symbol_hidden=symbol_hidden)
@@ -1025,6 +1039,7 @@ def build_example(configs, target_abi, enable_openmp, address_sanitizer,
                            abi=target_abi,
                            enable_openmp=enable_openmp,
                            enable_opencl=get_opencl_mode(configs),
+                            enable_quantize=get_quantize_mode(configs),
                            hexagon_mode=hexagon_mode,
                            address_sanitizer=address_sanitizer,
                            extra_args=build_arg)
@@ -1404,6 +1419,7 @@ def build_benchmark_model(configs, target_abi, enable_openmp, mace_lib_type):
                            abi=target_abi,
                            enable_openmp=enable_openmp,
                            enable_opencl=get_opencl_mode(configs),
+                            enable_quantize=get_quantize_mode(configs),
                            hexagon_mode=hexagon_mode,
                            symbol_hidden=symbol_hidden,
                            extra_args=build_arg)

--- a/tools/sh_commands.py
+++ b/tools/sh_commands.py
@@ -342,6 +342,7 @@ def bazel_build(target,
                enable_openmp=True,
                enable_neon=True,
                enable_opencl=True,
+                enable_quantize=True,
                address_sanitizer=False,
                symbol_hidden=True,
                extra_args=""):
@@ -351,6 +352,8 @@ def bazel_build(target,
            "build",
            "--define",
            "openmp=%s" % str(enable_openmp).lower(),
+            "--define",
+            "quantize=%s" % str(enable_quantize).lower(),
            target,
        )
    else:
@@ -367,6 +370,8 @@ def bazel_build(target,
            "--define",
            "opencl=%s" % str(enable_opencl).lower(),
            "--define",
+            "quantize=%s" % str(enable_quantize).lower(),
+            "--define",
            "hexagon=%s" % str(hexagon_mode).lower())
    if address_sanitizer:
        bazel_args += ("--config", "asan")