diff --git a/mace/ops/BUILD b/mace/ops/BUILD index 7f03ce12221a7e074e59a34cdb38f918b86ff51a..aa434203db3da0eeb6be83c3e0538b3b76551fbe 100644 --- a/mace/ops/BUILD +++ b/mace/ops/BUILD @@ -84,7 +84,7 @@ cc_library( ]), deps = [ "//mace/core", - "@gtest//:gtest", + "@gtest", ], ) @@ -254,7 +254,7 @@ cc_library( ":arm_neon_kernels", ":ref_kernels", ":testing", - "@gtest//:gtest", + "@gtest", ], alwayslink = 1, ) @@ -289,7 +289,7 @@ cc_library( ":opencl_kernels", ":ref_kernels", ":testing", - "@gtest//:gtest", + "@gtest", ], alwayslink = 1, ) @@ -329,12 +329,12 @@ cc_library( "ops_registry.h", "ops_test_util.h", "fixpoint.h", - "gemmlowp_util.h", + "common/gemmlowp_util.h", "quantization_util.h", ], ) + if_quantize_enabled(glob([ "fixpoint.h", - "gemmlowp_util.h", + "common/gemmlowp_util.h", "quantization_util.h", ])), copts = [ diff --git a/mace/ops/arm/q8/eltwise.cc b/mace/ops/arm/q8/eltwise.cc new file mode 100644 index 0000000000000000000000000000000000000000..f987da81373282f769f660e5f10e7795413b3be4 --- /dev/null +++ b/mace/ops/arm/q8/eltwise.cc @@ -0,0 +1,157 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "mace/ops/arm/q8/eltwise.h" + +#include <arm_neon.h> +#include <algorithm> + +#include "mace/ops/common/gemmlowp_util.h" +#include "mace/utils/logging.h" + +namespace mace { +namespace ops { +namespace arm { +namespace q8 { + +MaceStatus Eltwise::Compute(const OpContext *context, + const Tensor *input0, + const Tensor *input1, + Tensor *output) { + MACE_UNUSED(context); + MACE_CHECK(type_ == SUM || type_ == SUB, + "Quantized Elementwise only support SUM and SUB now."); + + constexpr int left_shift = 20; + const double doubled_scale = 2 * std::max(input0->scale(), input1->scale()); + const double adjusted_input0_scale = input0->scale() / doubled_scale; + const double adjusted_input1_scale = input1->scale() / doubled_scale; + const double adjusted_output_scale = + doubled_scale / ((1 << left_shift) * output->scale()); + + int32_t input0_multiplier; + int32_t input1_multiplier; + int32_t output_multiplier; + int32_t input0_shift; + int32_t input1_shift; + int32_t output_shift; + QuantizeMultiplier(adjusted_input0_scale, + &input0_multiplier, + &input0_shift); + QuantizeMultiplier(adjusted_input1_scale, + &input1_multiplier, + &input1_shift); + QuantizeMultiplier(adjusted_output_scale, + &output_multiplier, + &output_shift); + + Tensor::MappingGuard input0_guard(input0); + Tensor::MappingGuard input1_guard(input1); + Tensor::MappingGuard output_guard(output); + + auto input0_ptr = input0->data<uint8_t>(); + auto input1_ptr = input1->data<uint8_t>(); + auto output_ptr = output->mutable_data<uint8_t>(); + +#pragma omp parallel for schedule(runtime) + for (index_t i = 0; i <= output->size() - 8; i += 8) { + const auto input0_val = vld1_u8(input0_ptr + i); + const auto input1_val = vld1_u8(input1_ptr + i); + const auto input0_val_s16 = + vreinterpretq_s16_u16(vmovl_u8(input0_val)); + const auto input1_val_s16 = + vreinterpretq_s16_u16(vmovl_u8(input1_val)); + const auto offset_input0 = + vaddq_s16(input0_val_s16, vdupq_n_s16(-input0->zero_point())); + const auto offset_input1 = + vaddq_s16(input1_val_s16, 
vdupq_n_s16(-input1->zero_point())); + auto input0_low_s32 = vmovl_s16(vget_low_s16(offset_input0)); + auto input0_high_s32 = vmovl_s16(vget_high_s16(offset_input0)); + auto input1_low_s32 = vmovl_s16(vget_low_s16(offset_input1)); + auto input1_high_s32 = vmovl_s16(vget_high_s16(offset_input1)); + const auto left_shift_dup = vdupq_n_s32(left_shift); + input0_low_s32 = vshlq_s32(input0_low_s32, left_shift_dup); + input0_high_s32 = vshlq_s32(input0_high_s32, left_shift_dup); + input1_low_s32 = vshlq_s32(input1_low_s32, left_shift_dup); + input1_high_s32 = vshlq_s32(input1_high_s32, left_shift_dup); + input0_low_s32 = vqrdmulhq_n_s32(input0_low_s32, input0_multiplier); + input0_high_s32 = vqrdmulhq_n_s32(input0_high_s32, input0_multiplier); + input1_low_s32 = vqrdmulhq_n_s32(input1_low_s32, input1_multiplier); + input1_high_s32 = vqrdmulhq_n_s32(input1_high_s32, input1_multiplier); + const auto input0_shift_dup = vdupq_n_s32(input0_shift); + const auto input1_shift_dup = vdupq_n_s32(input1_shift); + input0_low_s32 = vshlq_s32(input0_low_s32, input0_shift_dup); + input0_high_s32 = vshlq_s32(input0_high_s32, input0_shift_dup); + input1_low_s32 = vshlq_s32(input1_low_s32, input1_shift_dup); + input1_high_s32 = vshlq_s32(input1_high_s32, input1_shift_dup); + int32x4_t res_low, res_high; + if (type_ == SUM) { + res_low = vaddq_s32(input0_low_s32, input1_low_s32); + res_high = vaddq_s32(input0_high_s32, input1_high_s32); + } else { + res_low = vsubq_s32(input0_low_s32, input1_low_s32); + res_high = vsubq_s32(input0_high_s32, input1_high_s32); + } + res_low = vqrdmulhq_n_s32(res_low, output_multiplier); + res_high = vqrdmulhq_n_s32(res_high, output_multiplier); + res_low = gemmlowp::RoundingDivideByPOT(res_low, -output_shift); + res_high = gemmlowp::RoundingDivideByPOT(res_high, -output_shift); + const auto res_low_s16 = vmovn_s32(res_low); + const auto res_high_s16 = vmovn_s32(res_high); + const auto output_val = vaddq_s16(vcombine_s16(res_low_s16, + res_high_s16), + 
vdupq_n_s16(output->zero_point())); + vst1_u8(output_ptr + i, vqmovun_s16(output_val)); + } + + index_t handled_output_size = output->size() - output->size() % 8; +#pragma omp parallel for schedule(runtime) + for (index_t i = handled_output_size; i < output->size(); ++i) { + const int32_t offset_input0 = input0_ptr[i] - input0->zero_point(); + const int32_t offset_input1 = input1_ptr[i] - input1->zero_point(); + const int32_t shifted_input0 = offset_input0 * (1 << left_shift); + const int32_t shifted_input1 = offset_input1 * (1 << left_shift); + const int32_t multiplied_input0 = + gemmlowp::RoundingDivideByPOT( + gemmlowp::SaturatingRoundingDoublingHighMul(shifted_input0, + input0_multiplier), + -input0_shift); + const int32_t multiplied_input1 = + gemmlowp::RoundingDivideByPOT( + gemmlowp::SaturatingRoundingDoublingHighMul(shifted_input1, + input1_multiplier), + -input1_shift); + + int32_t res; + if (type_ == SUM) { + res = multiplied_input0 + multiplied_input1; + } else { + res = multiplied_input0 - multiplied_input1; + } + + const int32_t output_val = + gemmlowp::RoundingDivideByPOT( + gemmlowp::SaturatingRoundingDoublingHighMul(res, + output_multiplier), + -output_shift) + output->zero_point(); + output_ptr[i] = Saturate(output_val); + } + + return MaceStatus::MACE_SUCCESS; +} + +} // namespace q8 +} // namespace arm +} // namespace ops +} // namespace mace diff --git a/mace/ops/arm/q8/eltwise.h b/mace/ops/arm/q8/eltwise.h new file mode 100644 index 0000000000000000000000000000000000000000..5223dc30c58c3b7a97059969133200c3465870c2 --- /dev/null +++ b/mace/ops/arm/q8/eltwise.h @@ -0,0 +1,48 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// This implements a quantized (uint8) elementwise SUM/SUB kernel +// using NEON fixed-point arithmetic. + +#ifndef MACE_OPS_ARM_Q8_ELTWISE_H_ +#define MACE_OPS_ARM_Q8_ELTWISE_H_ + +#include "mace/core/op_context.h" +#include "mace/core/types.h" +#include "mace/ops/common/eltwise_type.h" + +namespace mace { +namespace ops { +namespace arm { +namespace q8 { + +class Eltwise { + public: + explicit Eltwise(const EltwiseType type) : type_(type) {} + + MaceStatus Compute(const OpContext *context, + const Tensor *input0, + const Tensor *input1, + Tensor *output); + + private: + EltwiseType type_; +}; + +} // namespace q8 +} // namespace arm +} // namespace ops +} // namespace mace + +#endif  // MACE_OPS_ARM_Q8_ELTWISE_H_ diff --git a/mace/ops/common/eltwise_type.h b/mace/ops/common/eltwise_type.h new file mode 100644 index 0000000000000000000000000000000000000000..634c4919c18f221b255939a01d8411428b8f3476 --- /dev/null +++ b/mace/ops/common/eltwise_type.h @@ -0,0 +1,40 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_OPS_COMMON_ELTWISE_TYPE_H_ +#define MACE_OPS_COMMON_ELTWISE_TYPE_H_ + +namespace mace { +namespace ops { + +enum EltwiseType { + SUM = 0, + SUB = 1, + PROD = 2, + DIV = 3, + MIN = 4, + MAX = 5, + NEG = 6, + ABS = 7, + SQR_DIFF = 8, + POW = 9, + EQUAL = 10, + FLOOR_DIV = 11, + NONE = 12, +}; + +} // namespace ops +} // namespace mace + +#endif // MACE_OPS_COMMON_ELTWISE_TYPE_H_ diff --git a/mace/ops/gemmlowp_util.h b/mace/ops/common/gemmlowp_util.h similarity index 96% rename from mace/ops/gemmlowp_util.h rename to mace/ops/common/gemmlowp_util.h index c7091544ef5d90ef5fa11cbaacb052744dbe0ef0..c7eed2ad275c9b51cc5cf55cf2f88f90edf3d500 100644 --- a/mace/ops/gemmlowp_util.h +++ b/mace/ops/common/gemmlowp_util.h @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_OPS_GEMMLOWP_UTIL_H_ -#define MACE_OPS_GEMMLOWP_UTIL_H_ +#ifndef MACE_OPS_COMMON_GEMMLOWP_UTIL_H_ +#define MACE_OPS_COMMON_GEMMLOWP_UTIL_H_ #include @@ -75,4 +75,4 @@ struct GemmlowpOutputPipeline { }; } // namespace mace -#endif // MACE_OPS_GEMMLOWP_UTIL_H_ +#endif // MACE_OPS_COMMON_GEMMLOWP_UTIL_H_ diff --git a/mace/ops/conv_2d.cc b/mace/ops/conv_2d.cc index 19794b38be56fe3a99deb0583b0967575de571ae..653e3e33f535915eb52baad01e4e14f3b7a80bb7 100644 --- a/mace/ops/conv_2d.cc +++ b/mace/ops/conv_2d.cc @@ -41,7 +41,7 @@ #endif // MACE_ENABLE_NEON #ifdef MACE_ENABLE_QUANTIZE -#include "mace/ops/gemmlowp_util.h" +#include "mace/ops/common/gemmlowp_util.h" #include "mace/ops/quantization_util.h" #endif // MACE_ENABLE_QUANTIZE diff --git a/mace/ops/eltwise.cc b/mace/ops/eltwise.cc index f035eeee579907fea2ddb77d04ca5c982c903b67..768349fffd268d58aee8f05260f37c841021947e 100644 --- a/mace/ops/eltwise.cc +++ b/mace/ops/eltwise.cc @@ -12,6 +12,12 @@ // See the License for the specific language governing permissions 
and // limitations under the License. +#ifdef MACE_ENABLE_NEON +#ifdef MACE_ENABLE_QUANTIZE +#include "mace/ops/arm/q8/eltwise.h" +#endif  // MACE_ENABLE_QUANTIZE +#endif  // MACE_ENABLE_NEON + #include "mace/ops/eltwise.h" #include @@ -1035,19 +1041,30 @@ class EltwiseOp : public Operation { scalar_input_index_(Operation::GetOptionalArg<int>( "scalar_input_index", 1)), data_format_(static_cast<DataFormat>(Operation::GetOptionalArg<int>( - "data_format", 0))) {} + "data_format", 0))) +#ifdef MACE_ENABLE_NEON + , eltwise_(static_cast<ops::EltwiseType>(Operation::GetOptionalArg<int>( + "type", static_cast<int>(ops::EltwiseType::NONE)))) +#endif + {} MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); const Tensor *input0 = this->Input(0); - const Tensor *input1 = this->InputSize() == 2 ? this->Input(1) : nullptr; + MACE_CHECK(this->InputSize() == 2, + "Quantized Elementwise don't support broadcast now."); + const Tensor *input1 = this->Input(1); Tensor *output = this->Output(0); - MACE_CHECK(type_ == SUM, "Only support Elementwise SUM now. 
"); + MACE_CHECK(type_ == SUM || type_ == SUB, + "Quantized Elementwise only support SUM and SUB now."); MACE_CHECK(input0->size() == input1->size(), "input0 and input1 must have the same shape."); MACE_CHECK(output->scale() != 0); MACE_RETURN_IF_ERROR(output->Resize(input0->shape())); +#ifdef MACE_ENABLE_NEON + eltwise_.Compute(context, input0, input1, output); +#else constexpr int left_shift = 20; const double doubled_scale = 2 * std::max(input0->scale(), input1->scale()); const double adjusted_input0_scale = input0->scale() / doubled_scale; @@ -1078,57 +1095,8 @@ class EltwiseOp : public Operation { auto input0_ptr = input0->data(); auto input1_ptr = input1->data(); auto output_ptr = output->mutable_data(); - - index_t handled_output_size = 0; -#ifdef MACE_ENABLE_NEON - #pragma omp parallel for schedule(runtime) - for (index_t i = handled_output_size; i <= output->size() - 8; i += 8) { - const auto input0_val = vld1_u8(input0_ptr + i); - const auto input1_val = vld1_u8(input1_ptr + i); - const auto input0_val_s16 = - vreinterpretq_s16_u16(vmovl_u8(input0_val)); - const auto input1_val_s16 = - vreinterpretq_s16_u16(vmovl_u8(input1_val)); - const auto offset_input0 = - vaddq_s16(input0_val_s16, vdupq_n_s16(-input0->zero_point())); - const auto offset_input1 = - vaddq_s16(input1_val_s16, vdupq_n_s16(-input1->zero_point())); - auto input0_low_s32 = vmovl_s16(vget_low_s16(offset_input0)); - auto input0_high_s32 = vmovl_s16(vget_high_s16(offset_input0)); - auto input1_low_s32 = vmovl_s16(vget_low_s16(offset_input1)); - auto input1_high_s32 = vmovl_s16(vget_high_s16(offset_input1)); - const auto left_shift_dup = vdupq_n_s32(left_shift); - input0_low_s32 = vshlq_s32(input0_low_s32, left_shift_dup); - input0_high_s32 = vshlq_s32(input0_high_s32, left_shift_dup); - input1_low_s32 = vshlq_s32(input1_low_s32, left_shift_dup); - input1_high_s32 = vshlq_s32(input1_high_s32, left_shift_dup); - input0_low_s32 = vqrdmulhq_n_s32(input0_low_s32, input0_multiplier); - 
input0_high_s32 = vqrdmulhq_n_s32(input0_high_s32, input0_multiplier); - input1_low_s32 = vqrdmulhq_n_s32(input1_low_s32, input1_multiplier); - input1_high_s32 = vqrdmulhq_n_s32(input1_high_s32, input1_multiplier); - const auto input0_shift_dup = vdupq_n_s32(input0_shift); - const auto input1_shift_dup = vdupq_n_s32(input1_shift); - input0_low_s32 = vshlq_s32(input0_low_s32, input0_shift_dup); - input0_high_s32 = vshlq_s32(input0_high_s32, input0_shift_dup); - input1_low_s32 = vshlq_s32(input1_low_s32, input1_shift_dup); - input1_high_s32 = vshlq_s32(input1_high_s32, input1_shift_dup); - auto sum_low = vaddq_s32(input0_low_s32, input1_low_s32); - auto sum_high = vaddq_s32(input0_high_s32, input1_high_s32); - sum_low = vqrdmulhq_n_s32(sum_low, output_multiplier); - sum_high = vqrdmulhq_n_s32(sum_high, output_multiplier); - sum_low = gemmlowp::RoundingDivideByPOT(sum_low, -output_shift); - sum_high = gemmlowp::RoundingDivideByPOT(sum_high, -output_shift); - const auto sum_low_s16 = vmovn_s32(sum_low); - const auto sum_high_s16 = vmovn_s32(sum_high); - const auto output_val = vaddq_s16(vcombine_s16(sum_low_s16, - sum_high_s16), - vdupq_n_s16(output->zero_point())); - vst1_u8(output_ptr + i, vqmovun_s16(output_val)); - } - handled_output_size = output->size() - output->size() % 8; -#endif // NEON #pragma omp parallel for schedule(runtime) - for (index_t i = handled_output_size; i < output->size(); ++i) { + for (index_t i = 0; i < output->size(); ++i) { const int32_t offset_input0 = input0_ptr[i] - input0->zero_point(); const int32_t offset_input1 = input1_ptr[i] - input1->zero_point(); const int32_t shifted_input0 = offset_input0 * (1 << left_shift); @@ -1143,14 +1111,22 @@ class EltwiseOp : public Operation { gemmlowp::SaturatingRoundingDoublingHighMul(shifted_input1, input1_multiplier), -input1_shift); - const int32_t sum = multiplied_input0 + multiplied_input1; + + int32_t res; + if (type_ == SUM) { + res = multiplied_input0 + multiplied_input1; + } else { + res = 
multiplied_input0 - multiplied_input1; + } + const int32_t output_val = gemmlowp::RoundingDivideByPOT( - gemmlowp::SaturatingRoundingDoublingHighMul(sum, + gemmlowp::SaturatingRoundingDoublingHighMul(res, output_multiplier), -output_shift) + output->zero_point(); output_ptr[i] = Saturate(output_val); } +#endif // NEON return MaceStatus::MACE_SUCCESS; } @@ -1162,6 +1138,9 @@ class EltwiseOp : public Operation { int32_t scalar_input_index_; DataFormat data_format_; Tensor scalar_tensor_; +#ifdef MACE_ENABLE_NEON + arm::q8::Eltwise eltwise_; +#endif }; #endif // MACE_ENABLE_QUANTIZE diff --git a/mace/ops/eltwise.h b/mace/ops/eltwise.h index c79c6c27abfb3cef4ed02abfacc3dea5384e1bd3..208d7f26549b6642502dcf6022983ad4f0f52622 100644 --- a/mace/ops/eltwise.h +++ b/mace/ops/eltwise.h @@ -15,25 +15,11 @@ #ifndef MACE_OPS_ELTWISE_H_ #define MACE_OPS_ELTWISE_H_ +#include "mace/ops/common/eltwise_type.h" + namespace mace { namespace ops { -enum EltwiseType { - SUM = 0, - SUB = 1, - PROD = 2, - DIV = 3, - MIN = 4, - MAX = 5, - NEG = 6, - ABS = 7, - SQR_DIFF = 8, - POW = 9, - EQUAL = 10, - FLOOR_DIV = 11, - NONE = 12, -}; - inline bool IsLogicalType(EltwiseType type) { return type == EQUAL; } } // namespace ops diff --git a/mace/ops/eltwise_benchmark.cc b/mace/ops/eltwise_benchmark.cc index 0bfb666f70d3fd606703e32bcd3a4baf3f788fa6..cb239f53e0c01d79dc718d5b3d4eca636b187863 100644 --- a/mace/ops/eltwise_benchmark.cc +++ b/mace/ops/eltwise_benchmark.cc @@ -30,12 +30,12 @@ void EltwiseBenchmark( OpsTestNet net; // Add input data - if (D == DeviceType::GPU) { - net.AddRandomInput("Input0", {n, h, w, c}); - net.AddRandomInput("Input1", {n, h, w, c}); - } else { + if (D == DeviceType::CPU && DataTypeToEnum::value != DT_UINT8) { net.AddRandomInput("Input0", {n, c, h, w}); net.AddRandomInput("Input1", {n, c, h, w}); + } else { + net.AddRandomInput("Input0", {n, h, w, c}); + net.AddRandomInput("Input1", {n, h, w, c}); } OpDefBuilder("Eltwise", "EltwiseTest") @@ -47,15 +47,21 @@ void 
EltwiseBenchmark( .Output("Output") .Finalize(net.NewOperatorDef()); + net.Setup(D); + + if (D == DeviceType::CPU && DataTypeToEnum::value == DT_UINT8) { + net.GetTensor("Output")->SetScale(0.1); + } + // Warm-up for (int i = 0; i < 5; ++i) { - net.RunOp(D); + net.Run(); net.Sync(); } mace::testing::StartTiming(); while (iters--) { - net.RunOp(D); + net.Run(); net.Sync(); } } @@ -86,6 +92,9 @@ MACE_BM_ELTWISE(0, 1, 240, 240, 256); MACE_BM_ELTWISE(5, 1, 128, 128, 32); MACE_BM_ELTWISE(5, 1, 240, 240, 256); +MACE_BM_ELTWISE_MACRO(0, 1, 128, 128, 32, uint8_t, CPU); +MACE_BM_ELTWISE_MACRO(1, 1, 128, 128, 32, uint8_t, CPU); + } // namespace test } // namespace ops } // namespace mace diff --git a/mace/ops/eltwise_test.cc b/mace/ops/eltwise_test.cc index 7ca799e2e8701b8adb439218c17ce10d8fbd0f56..4f18810e73213c6113cca236d39e215731435983 100644 --- a/mace/ops/eltwise_test.cc +++ b/mace/ops/eltwise_test.cc @@ -729,7 +729,8 @@ void RandomTensorEltwise(const ops::EltwiseType type, } } -void QuantizedSum(const std::vector &shape) { +void Quantized(const std::vector &shape, + const ops::EltwiseType type) { // Construct graph OpsTestNet net; @@ -753,7 +754,7 @@ void QuantizedSum(const std::vector &shape) { OpDefBuilder("Eltwise", "EltwiseTest") .Input("TInput0") .Input("TInput1") - .AddIntArg("type", static_cast(ops::EltwiseType::SUM)) + .AddIntArg("type", static_cast(type)) .AddIntArg("data_format", DataFormat::NCHW) .Output("TOutput") .Finalize(net.NewOperatorDef()); @@ -794,7 +795,7 @@ void QuantizedSum(const std::vector &shape) { .Input("QuantizedInput0") .Input("QuantizedInput1") .Output("QuantizedOutput") - .AddIntArg("type", static_cast(ops::EltwiseType::SUM)) + .AddIntArg("type", static_cast(type)) .AddIntArg("T", static_cast(DT_UINT8)) .Finalize(net.NewOperatorDef()); net.Setup(DeviceType::CPU); @@ -1009,9 +1010,11 @@ TEST_F(EltwiseOpTest, TensorGeneralBroadcastGPU) { {1, 1, 2, 1}, {2, 3}, {1, 1, 2, 5}, {4, 1, 0, 1, 4, 4, 9, 16, 25, 36}); } -TEST_F(EltwiseOpTest, 
QuantizedSum) { - QuantizedSum({1, 32, 32, 16}); - QuantizedSum({1, 31, 31, 17}); +TEST_F(EltwiseOpTest, Quantized) { + Quantized({1, 32, 32, 16}, ops::EltwiseType::SUM); + Quantized({1, 31, 31, 17}, ops::EltwiseType::SUM); + Quantized({1, 32, 32, 16}, ops::EltwiseType::SUB); + Quantized({1, 31, 31, 17}, ops::EltwiseType::SUB); } } // namespace test diff --git a/mace/ops/matmul.cc b/mace/ops/matmul.cc index a3aebcb49abe323a24bc792f857577481be19f35..07a65d79459166281a6f13cb4d58817a69d0f3ac 100644 --- a/mace/ops/matmul.cc +++ b/mace/ops/matmul.cc @@ -38,7 +38,7 @@ #endif // MACE_ENABLE_NEON #ifdef MACE_ENABLE_QUANTIZE -#include "mace/ops/gemmlowp_util.h" +#include "mace/ops/common/gemmlowp_util.h" #endif // MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_OPENCL diff --git a/mace/ops/softmax.cc b/mace/ops/softmax.cc index 54f3e55bbaf07d04026ed28de0ed361bd9ff2061..b407ac34c357a6e81295007ee946ff61e0c18b7b 100644 --- a/mace/ops/softmax.cc +++ b/mace/ops/softmax.cc @@ -22,7 +22,7 @@ #ifdef MACE_ENABLE_QUANTIZE #include "mace/ops/fixpoint.h" -#include "mace/ops/gemmlowp_util.h" +#include "mace/ops/common/gemmlowp_util.h" #endif // MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_OPENCL diff --git a/mace/python/tools/converter_tool/transformer.py b/mace/python/tools/converter_tool/transformer.py index 33d4633635528b94a3d8d0ed108398368572a36c..f93d1819fb0ef025e6f034032c16795d878b3591 100644 --- a/mace/python/tools/converter_tool/transformer.py +++ b/mace/python/tools/converter_tool/transformer.py @@ -1423,8 +1423,9 @@ class Transformer(base_converter.ConverterInterface): else: mace_check(op.type == MaceOp.Quantize.name, "Quantization only support float ops, " - "but get %s(%s)" - % (op.name, op.type)) + "but get %s(%s, %s)" + % (op.name, op.type, + mace_pb2.DataType.Name(data_type_arg.i))) for input_node in self._option.input_nodes.values(): new_input_name = self.input_name_map[input_node.name] @@ -1725,18 +1726,29 @@ class Transformer(base_converter.ConverterInterface): 
self.add_quantize_info(op, 0.0, 1.0) self._quantize_activation_info[op.output[0]] = quantize_info elif (op.type == MaceOp.Eltwise.name - and ConverterUtil.get_arg(op, MaceKeyword.mace_element_type_str).i == EltwiseType.SUM.value # noqa and not op.quantize_info and len(op.input) == 2 and len(op.input[0]) not in self._consts and len(op.input[1]) not in self._consts): - del op.quantize_info[:] producer_op0 = self._producer[op.input[0]] producer_op1 = self._producer[op.input[1]] - minval = producer_op0.quantize_info[0].minval \ - + producer_op1.quantize_info[0].minval - maxval = producer_op0.quantize_info[0].maxval \ - + producer_op1.quantize_info[0].maxval + if ConverterUtil.get_arg( + op, MaceKeyword.mace_element_type_str).i \ + == EltwiseType.SUM.value: + minval = producer_op0.quantize_info[0].minval \ + + producer_op1.quantize_info[0].minval + maxval = producer_op0.quantize_info[0].maxval \ + + producer_op1.quantize_info[0].maxval + elif ConverterUtil.get_arg( + op, MaceKeyword.mace_element_type_str).i \ + == EltwiseType.SUB.value: + minval = producer_op0.quantize_info[0].minval \ + - producer_op1.quantize_info[0].maxval + maxval = producer_op0.quantize_info[0].maxval \ + - producer_op1.quantize_info[0].minval + else: + mace_check(False, "Quantized Elementwise only support:" + " SUM and SUB now.") quantize_info = \ self.add_quantize_info(op, minval, maxval) self._quantize_activation_info[op.output[0]] = quantize_info diff --git a/mace/utils/quantize.h b/mace/utils/quantize.h index 81d820cbfc39b2fe9edb729071d351c1993b1b01..ae8551c15f47b112c469085b868af2694ee7b452 100644 --- a/mace/utils/quantize.h +++ b/mace/utils/quantize.h @@ -19,6 +19,8 @@ #include #include +#include "mace/utils/logging.h" + namespace mace { template @@ -138,11 +140,6 @@ inline void Dequantize(const T *input, inline void QuantizeMultiplier(double multiplier, int32_t* output_multiplier, int32_t* shift) { - if (multiplier == 0.f) { - *output_multiplier = 0; - *shift = 0; - return; - } const double 
q = std::frexp(multiplier, shift); auto qint = static_cast<int64_t>(roundl(q * (1ll << 31))); if (qint == (1ll << 31)) {