Commit d4f1368d authored by Bin Li

Add quantized eltwise sub

Parent f2f05c0d
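For reference, the new kernels implement the gemmlowp-style requantization scheme sketched below. This is a minimal float-reference sketch, not code from the commit: QuantizedAddOrSub is a hypothetical helper name, and the shipped kernels perform the same computation in int32 fixed point via QuantizeMultiplier, SaturatingRoundingDoublingHighMul, and RoundingDivideByPOT.

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Dequantize both inputs onto the common scale 2 * max(s0, s1), combine,
    // then requantize to the output scale. The factor of 2 keeps the sum or
    // difference of the two rescaled values inside the representable range.
    uint8_t QuantizedAddOrSub(uint8_t a, uint8_t b,
                              float s0, int32_t z0,  // input0 scale / zero point
                              float s1, int32_t z1,  // input1 scale / zero point
                              float so, int32_t zo,  // output scale / zero point
                              bool is_sub) {
      const float common_scale = 2 * std::max(s0, s1);
      const float v0 = (a - z0) * (s0 / common_scale);  // adjusted_input0_scale
      const float v1 = (b - z1) * (s1 / common_scale);  // adjusted_input1_scale
      const float res = is_sub ? v0 - v1 : v0 + v1;
      const int32_t q =
          static_cast<int32_t>(std::lround(res * common_scale / so)) + zo;
      return static_cast<uint8_t>(std::min<int32_t>(255, std::max<int32_t>(0, q)));
    }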
mace/ops/BUILD
...
@@ -84,7 +84,7 @@ cc_library(
     ]),
     deps = [
         "//mace/core",
-        "@gtest//:gtest",
+        "@gtest",
     ],
 )
...
@@ -254,7 +254,7 @@ cc_library(
         ":arm_neon_kernels",
         ":ref_kernels",
         ":testing",
-        "@gtest//:gtest",
+        "@gtest",
     ],
     alwayslink = 1,
 )
...
@@ -289,7 +289,7 @@ cc_library(
        ":opencl_kernels",
        ":ref_kernels",
        ":testing",
-       "@gtest//:gtest",
+       "@gtest",
    ],
    alwayslink = 1,
 )
...
@@ -329,12 +329,12 @@ cc_library(
         "ops_registry.h",
         "ops_test_util.h",
         "fixpoint.h",
-        "gemmlowp_util.h",
+        "common/gemmlowp_util.h",
         "quantization_util.h",
     ],
 ) + if_quantize_enabled(glob([
     "fixpoint.h",
-    "gemmlowp_util.h",
+    "common/gemmlowp_util.h",
     "quantization_util.h",
 ])),
 copts = [
...
mace/ops/arm/q8/eltwise.cc (new file)
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/arm/q8/eltwise.h"
#include <arm_neon.h>
#include <algorithm>
#include "mace/ops/common/gemmlowp_util.h"
#include "mace/utils/logging.h"
namespace mace {
namespace ops {
namespace arm {
namespace q8 {
MaceStatus Eltwise::Compute(const OpContext *context,
const Tensor *input0,
const Tensor *input1,
Tensor *output) {
MACE_UNUSED(context);
MACE_CHECK(type_ == SUM || type_ == SUB,
"Quantized Elementwise only support SUM and SUB now.");
constexpr int left_shift = 20;
const double doubled_scale = 2 * std::max(input0->scale(), input1->scale());
const double adjusted_input0_scale = input0->scale() / doubled_scale;
const double adjusted_input1_scale = input1->scale() / doubled_scale;
const double adjusted_output_scale =
doubled_scale / ((1 << left_shift) * output->scale());
int32_t input0_multiplier;
int32_t input1_multiplier;
int32_t output_multiplier;
int32_t input0_shift;
int32_t input1_shift;
int32_t output_shift;
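  // QuantizeMultiplier (quantization_util.h) decomposes each adjusted scale
  // into a Q31 fixed-point multiplier plus a power-of-two shift.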
QuantizeMultiplier(adjusted_input0_scale,
&input0_multiplier,
&input0_shift);
QuantizeMultiplier(adjusted_input1_scale,
&input1_multiplier,
&input1_shift);
QuantizeMultiplier(adjusted_output_scale,
&output_multiplier,
&output_shift);
Tensor::MappingGuard input0_guard(input0);
Tensor::MappingGuard input1_guard(input1);
Tensor::MappingGuard output_guard(output);
auto input0_ptr = input0->data<uint8_t>();
auto input1_ptr = input1->data<uint8_t>();
auto output_ptr = output->mutable_data<uint8_t>();
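  // Vectorized main loop: 8 uint8 elements per iteration, widened
  // u8 -> s16 -> s32 before the fixed-point rescaling is applied.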
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i <= output->size() - 8; i += 8) {
const auto input0_val = vld1_u8(input0_ptr + i);
const auto input1_val = vld1_u8(input1_ptr + i);
const auto input0_val_s16 =
vreinterpretq_s16_u16(vmovl_u8(input0_val));
const auto input1_val_s16 =
vreinterpretq_s16_u16(vmovl_u8(input1_val));
const auto offset_input0 =
vaddq_s16(input0_val_s16, vdupq_n_s16(-input0->zero_point()));
const auto offset_input1 =
vaddq_s16(input1_val_s16, vdupq_n_s16(-input1->zero_point()));
auto input0_low_s32 = vmovl_s16(vget_low_s16(offset_input0));
auto input0_high_s32 = vmovl_s16(vget_high_s16(offset_input0));
auto input1_low_s32 = vmovl_s16(vget_low_s16(offset_input1));
auto input1_high_s32 = vmovl_s16(vget_high_s16(offset_input1));
const auto left_shift_dup = vdupq_n_s32(left_shift);
input0_low_s32 = vshlq_s32(input0_low_s32, left_shift_dup);
input0_high_s32 = vshlq_s32(input0_high_s32, left_shift_dup);
input1_low_s32 = vshlq_s32(input1_low_s32, left_shift_dup);
input1_high_s32 = vshlq_s32(input1_high_s32, left_shift_dup);
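    // vqrdmulhq_n_s32 is the NEON form of gemmlowp's
    // SaturatingRoundingDoublingHighMul: multiply by the Q31 multiplier and
    // keep the rounded high 32 bits.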
input0_low_s32 = vqrdmulhq_n_s32(input0_low_s32, input0_multiplier);
input0_high_s32 = vqrdmulhq_n_s32(input0_high_s32, input0_multiplier);
input1_low_s32 = vqrdmulhq_n_s32(input1_low_s32, input1_multiplier);
input1_high_s32 = vqrdmulhq_n_s32(input1_high_s32, input1_multiplier);
const auto input0_shift_dup = vdupq_n_s32(input0_shift);
const auto input1_shift_dup = vdupq_n_s32(input1_shift);
input0_low_s32 = vshlq_s32(input0_low_s32, input0_shift_dup);
input0_high_s32 = vshlq_s32(input0_high_s32, input0_shift_dup);
input1_low_s32 = vshlq_s32(input1_low_s32, input1_shift_dup);
input1_high_s32 = vshlq_s32(input1_high_s32, input1_shift_dup);
int32x4_t res_low, res_high;
if (type_ == SUM) {
res_low = vaddq_s32(input0_low_s32, input1_low_s32);
res_high = vaddq_s32(input0_high_s32, input1_high_s32);
} else {
res_low = vsubq_s32(input0_low_s32, input1_low_s32);
res_high = vsubq_s32(input0_high_s32, input1_high_s32);
}
res_low = vqrdmulhq_n_s32(res_low, output_multiplier);
res_high = vqrdmulhq_n_s32(res_high, output_multiplier);
res_low = gemmlowp::RoundingDivideByPOT(res_low, -output_shift);
res_high = gemmlowp::RoundingDivideByPOT(res_high, -output_shift);
const auto res_low_s16 = vmovn_s32(res_low);
const auto res_high_s16 = vmovn_s32(res_high);
const auto output_val = vaddq_s16(vcombine_s16(res_low_s16,
res_high_s16),
vdupq_n_s16(output->zero_point()));
vst1_u8(output_ptr + i, vqmovun_s16(output_val));
}
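  // Scalar tail: the remaining size % 8 elements go through the same
  // arithmetic using gemmlowp's scalar helpers.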
index_t handled_output_size = output->size() - output->size() % 8;
#pragma omp parallel for schedule(runtime)
for (index_t i = handled_output_size; i < output->size(); ++i) {
const int32_t offset_input0 = input0_ptr[i] - input0->zero_point();
const int32_t offset_input1 = input1_ptr[i] - input1->zero_point();
const int32_t shifted_input0 = offset_input0 * (1 << left_shift);
const int32_t shifted_input1 = offset_input1 * (1 << left_shift);
const int32_t multiplied_input0 =
gemmlowp::RoundingDivideByPOT(
gemmlowp::SaturatingRoundingDoublingHighMul(shifted_input0,
input0_multiplier),
-input0_shift);
const int32_t multiplied_input1 =
gemmlowp::RoundingDivideByPOT(
gemmlowp::SaturatingRoundingDoublingHighMul(shifted_input1,
input1_multiplier),
-input1_shift);
int32_t res;
if (type_ == SUM) {
res = multiplied_input0 + multiplied_input1;
} else {
res = multiplied_input0 - multiplied_input1;
}
const int32_t output_val =
gemmlowp::RoundingDivideByPOT(
gemmlowp::SaturatingRoundingDoublingHighMul(res,
output_multiplier),
-output_shift) + output->zero_point();
output_ptr[i] = Saturate<uint8_t>(output_val);
}
return MaceStatus::MACE_SUCCESS;
}
} // namespace q8
} // namespace arm
} // namespace ops
} // namespace mace
mace/ops/arm/q8/eltwise.h (new file)
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Quantized (uint8) elementwise kernels; the fixed-point arithmetic follows
// gemmlowp's quantized math helpers.
#ifndef MACE_OPS_ARM_Q8_ELTWISE_H_
#define MACE_OPS_ARM_Q8_ELTWISE_H_
#include "mace/core/op_context.h"
#include "mace/core/types.h"
#include "mace/ops/common/eltwise_type.h"
namespace mace {
namespace ops {
namespace arm {
namespace q8 {
class Eltwise {
public:
explicit Eltwise(const EltwiseType type) : type_(type) {}
MaceStatus Compute(const OpContext *context,
const Tensor *input0,
const Tensor *input1,
Tensor *output);
private:
EltwiseType type_;
};
} // namespace q8
} // namespace arm
} // namespace ops
} // namespace mace
#endif // MACE_OPS_ARM_Q8_ELTWISE_H_
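As wired into EltwiseOp<DeviceType::CPU, uint8_t> in the eltwise.cc diff below: the op constructs an arm::q8::Eltwise from its "type" argument, and when built with MACE_ENABLE_NEON, Run() delegates to eltwise_.Compute(context, input0, input1, output).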
mace/ops/common/eltwise_type.h (new file)
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_COMMON_ELTWISE_TYPE_H_
#define MACE_OPS_COMMON_ELTWISE_TYPE_H_
namespace mace {
namespace ops {
enum EltwiseType {
SUM = 0,
SUB = 1,
PROD = 2,
DIV = 3,
MIN = 4,
MAX = 5,
NEG = 6,
ABS = 7,
SQR_DIFF = 8,
POW = 9,
EQUAL = 10,
FLOOR_DIV = 11,
NONE = 12,
};
} // namespace ops
} // namespace mace
#endif // MACE_OPS_COMMON_ELTWISE_TYPE_H_
mace/ops/common/gemmlowp_util.h (moved from mace/ops/gemmlowp_util.h)
...
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#ifndef MACE_OPS_GEMMLOWP_UTIL_H_
-#define MACE_OPS_GEMMLOWP_UTIL_H_
+#ifndef MACE_OPS_COMMON_GEMMLOWP_UTIL_H_
+#define MACE_OPS_COMMON_GEMMLOWP_UTIL_H_

 #include <tuple>
...
@@ -75,4 +75,4 @@ struct GemmlowpOutputPipeline {
 };
 }  // namespace mace

-#endif  // MACE_OPS_GEMMLOWP_UTIL_H_
+#endif  // MACE_OPS_COMMON_GEMMLOWP_UTIL_H_
...
@@ -41,7 +41,7 @@
 #endif  // MACE_ENABLE_NEON

 #ifdef MACE_ENABLE_QUANTIZE
-#include "mace/ops/gemmlowp_util.h"
+#include "mace/ops/common/gemmlowp_util.h"
 #include "mace/ops/quantization_util.h"
 #endif  // MACE_ENABLE_QUANTIZE
...
mace/ops/eltwise.cc
...
@@ -12,6 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#ifdef MACE_ENABLE_NEON
+#ifdef MACE_ENABLE_QUANTIZE
+#include "mace/ops/arm/q8/eltwise.h"
+#endif  // MACE_ENABLE_QUANTIZE
+#endif  // MACE_ENABLE_NEON
+
 #include "mace/ops/eltwise.h"

 #include <algorithm>
...
@@ -1035,19 +1041,30 @@ class EltwiseOp<DeviceType::CPU, uint8_t> : public Operation {
         scalar_input_index_(Operation::GetOptionalArg<int32_t>(
             "scalar_input_index", 1)),
         data_format_(static_cast<DataFormat>(Operation::GetOptionalArg<int>(
-            "data_format", 0))) {}
+            "data_format", 0)))
+#ifdef MACE_ENABLE_NEON
+        , eltwise_(static_cast<ops::EltwiseType>(Operation::GetOptionalArg<int>(
+              "type", static_cast<int>(ops::EltwiseType::NONE))))
+#endif
+  {}

   MaceStatus Run(OpContext *context) override {
     MACE_UNUSED(context);
     const Tensor *input0 = this->Input(0);
-    const Tensor *input1 = this->InputSize() == 2 ? this->Input(1) : nullptr;
+    MACE_CHECK(this->InputSize() == 2,
+               "Quantized Elementwise don't support broadcast now.");
+    const Tensor *input1 = this->Input(1);
     Tensor *output = this->Output(0);
-    MACE_CHECK(type_ == SUM, "Only support Elementwise SUM now. ");
+    MACE_CHECK(type_ == SUM || type_ == SUB,
+               "Quantized Elementwise only support SUM and SUB now.");
     MACE_CHECK(input0->size() == input1->size(),
                "input0 and input1 must have the same shape.");
     MACE_CHECK(output->scale() != 0);
     MACE_RETURN_IF_ERROR(output->Resize(input0->shape()));

+#ifdef MACE_ENABLE_NEON
+    eltwise_.Compute(context, input0, input1, output);
+#else
     constexpr int left_shift = 20;
     const double doubled_scale = 2 * std::max(input0->scale(), input1->scale());
     const double adjusted_input0_scale = input0->scale() / doubled_scale;
...
@@ -1078,57 +1095,8 @@ class EltwiseOp<DeviceType::CPU, uint8_t> : public Operation {
     auto input0_ptr = input0->data<uint8_t>();
     auto input1_ptr = input1->data<uint8_t>();
     auto output_ptr = output->mutable_data<uint8_t>();

-    index_t handled_output_size = 0;
-#ifdef MACE_ENABLE_NEON
-#pragma omp parallel for schedule(runtime)
-    for (index_t i = handled_output_size; i <= output->size() - 8; i += 8) {
-      const auto input0_val = vld1_u8(input0_ptr + i);
-      const auto input1_val = vld1_u8(input1_ptr + i);
-      const auto input0_val_s16 =
-          vreinterpretq_s16_u16(vmovl_u8(input0_val));
-      const auto input1_val_s16 =
-          vreinterpretq_s16_u16(vmovl_u8(input1_val));
-      const auto offset_input0 =
-          vaddq_s16(input0_val_s16, vdupq_n_s16(-input0->zero_point()));
-      const auto offset_input1 =
-          vaddq_s16(input1_val_s16, vdupq_n_s16(-input1->zero_point()));
-      auto input0_low_s32 = vmovl_s16(vget_low_s16(offset_input0));
-      auto input0_high_s32 = vmovl_s16(vget_high_s16(offset_input0));
-      auto input1_low_s32 = vmovl_s16(vget_low_s16(offset_input1));
-      auto input1_high_s32 = vmovl_s16(vget_high_s16(offset_input1));
-      const auto left_shift_dup = vdupq_n_s32(left_shift);
-      input0_low_s32 = vshlq_s32(input0_low_s32, left_shift_dup);
-      input0_high_s32 = vshlq_s32(input0_high_s32, left_shift_dup);
-      input1_low_s32 = vshlq_s32(input1_low_s32, left_shift_dup);
-      input1_high_s32 = vshlq_s32(input1_high_s32, left_shift_dup);
-      input0_low_s32 = vqrdmulhq_n_s32(input0_low_s32, input0_multiplier);
-      input0_high_s32 = vqrdmulhq_n_s32(input0_high_s32, input0_multiplier);
-      input1_low_s32 = vqrdmulhq_n_s32(input1_low_s32, input1_multiplier);
-      input1_high_s32 = vqrdmulhq_n_s32(input1_high_s32, input1_multiplier);
-      const auto input0_shift_dup = vdupq_n_s32(input0_shift);
-      const auto input1_shift_dup = vdupq_n_s32(input1_shift);
-      input0_low_s32 = vshlq_s32(input0_low_s32, input0_shift_dup);
-      input0_high_s32 = vshlq_s32(input0_high_s32, input0_shift_dup);
-      input1_low_s32 = vshlq_s32(input1_low_s32, input1_shift_dup);
-      input1_high_s32 = vshlq_s32(input1_high_s32, input1_shift_dup);
-      auto sum_low = vaddq_s32(input0_low_s32, input1_low_s32);
-      auto sum_high = vaddq_s32(input0_high_s32, input1_high_s32);
-      sum_low = vqrdmulhq_n_s32(sum_low, output_multiplier);
-      sum_high = vqrdmulhq_n_s32(sum_high, output_multiplier);
-      sum_low = gemmlowp::RoundingDivideByPOT(sum_low, -output_shift);
-      sum_high = gemmlowp::RoundingDivideByPOT(sum_high, -output_shift);
-      const auto sum_low_s16 = vmovn_s32(sum_low);
-      const auto sum_high_s16 = vmovn_s32(sum_high);
-      const auto output_val = vaddq_s16(vcombine_s16(sum_low_s16,
-                                                     sum_high_s16),
-                                        vdupq_n_s16(output->zero_point()));
-      vst1_u8(output_ptr + i, vqmovun_s16(output_val));
-    }
-    handled_output_size = output->size() - output->size() % 8;
-#endif  // NEON
 #pragma omp parallel for schedule(runtime)
-    for (index_t i = handled_output_size; i < output->size(); ++i) {
+    for (index_t i = 0; i < output->size(); ++i) {
       const int32_t offset_input0 = input0_ptr[i] - input0->zero_point();
       const int32_t offset_input1 = input1_ptr[i] - input1->zero_point();
       const int32_t shifted_input0 = offset_input0 * (1 << left_shift);
...
@@ -1143,14 +1111,22 @@ class EltwiseOp<DeviceType::CPU, uint8_t> : public Operation {
           gemmlowp::SaturatingRoundingDoublingHighMul(shifted_input1,
                                                       input1_multiplier),
           -input1_shift);
-      const int32_t sum = multiplied_input0 + multiplied_input1;
+      int32_t res;
+      if (type_ == SUM) {
+        res = multiplied_input0 + multiplied_input1;
+      } else {
+        res = multiplied_input0 - multiplied_input1;
+      }
       const int32_t output_val =
           gemmlowp::RoundingDivideByPOT(
               gemmlowp::SaturatingRoundingDoublingHighMul(res,
                                                           output_multiplier),
               -output_shift) + output->zero_point();
       output_ptr[i] = Saturate<uint8_t>(output_val);
     }
+#endif  // NEON

     return MaceStatus::MACE_SUCCESS;
   }
...
@@ -1162,6 +1138,9 @@ class EltwiseOp<DeviceType::CPU, uint8_t> : public Operation {
   int32_t scalar_input_index_;
   DataFormat data_format_;
   Tensor scalar_tensor_;
+#ifdef MACE_ENABLE_NEON
+  arm::q8::Eltwise eltwise_;
+#endif
 };

 #endif  // MACE_ENABLE_QUANTIZE
...
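Note that after this change the scalar fixed-point loop in eltwise.cc is compiled only when MACE_ENABLE_NEON is off; with NEON enabled the op delegates entirely to arm::q8::Eltwise, which carries both the vector body and its own scalar tail.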
mace/ops/eltwise.h
...
@@ -15,25 +15,11 @@
 #ifndef MACE_OPS_ELTWISE_H_
 #define MACE_OPS_ELTWISE_H_

+#include "mace/ops/common/eltwise_type.h"
+
 namespace mace {
 namespace ops {

-enum EltwiseType {
-  SUM = 0,
-  SUB = 1,
-  PROD = 2,
-  DIV = 3,
-  MIN = 4,
-  MAX = 5,
-  NEG = 6,
-  ABS = 7,
-  SQR_DIFF = 8,
-  POW = 9,
-  EQUAL = 10,
-  FLOOR_DIV = 11,
-  NONE = 12,
-};
-
 inline bool IsLogicalType(EltwiseType type) { return type == EQUAL; }

 }  // namespace ops
...
mace/ops/eltwise_benchmark.cc
...
@@ -30,12 +30,12 @@ void EltwiseBenchmark(
   OpsTestNet net;

   // Add input data
-  if (D == DeviceType::GPU) {
-    net.AddRandomInput<D, T>("Input0", {n, h, w, c});
-    net.AddRandomInput<D, T>("Input1", {n, h, w, c});
-  } else {
+  if (D == DeviceType::CPU && DataTypeToEnum<T>::value != DT_UINT8) {
     net.AddRandomInput<D, T>("Input0", {n, c, h, w});
     net.AddRandomInput<D, T>("Input1", {n, c, h, w});
+  } else {
+    net.AddRandomInput<D, T>("Input0", {n, h, w, c});
+    net.AddRandomInput<D, T>("Input1", {n, h, w, c});
   }

   OpDefBuilder("Eltwise", "EltwiseTest")
...
@@ -47,15 +47,21 @@
       .Output("Output")
       .Finalize(net.NewOperatorDef());

+  net.Setup(D);
+  if (D == DeviceType::CPU && DataTypeToEnum<T>::value == DT_UINT8) {
+    net.GetTensor("Output")->SetScale(0.1);
+  }
+
   // Warm-up
   for (int i = 0; i < 5; ++i) {
-    net.RunOp(D);
+    net.Run();
     net.Sync();
   }

   mace::testing::StartTiming();
   while (iters--) {
-    net.RunOp(D);
+    net.Run();
     net.Sync();
   }
 }
...
@@ -86,6 +92,9 @@ MACE_BM_ELTWISE(0, 1, 240, 240, 256);
 MACE_BM_ELTWISE(5, 1, 128, 128, 32);
 MACE_BM_ELTWISE(5, 1, 240, 240, 256);

+MACE_BM_ELTWISE_MACRO(0, 1, 128, 128, 32, uint8_t, CPU);
+MACE_BM_ELTWISE_MACRO(1, 1, 128, 128, 32, uint8_t, CPU);
+
 }  // namespace test
 }  // namespace ops
 }  // namespace mace
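The added net.Setup(D) plus SetScale(0.1) gives the quantized output tensor a nonzero scale before the benchmark runs, which the uint8 kernel requires (it checks output->scale() != 0); the warm-up and timing loops switch from RunOp(D) to Run(), presumably so setup is not repeated on every iteration.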
mace/ops/eltwise_test.cc
...
@@ -729,7 +729,8 @@ void RandomTensorEltwise(const ops::EltwiseType type,
   }
 }

-void QuantizedSum(const std::vector<index_t> &shape) {
+void Quantized(const std::vector<index_t> &shape,
+               const ops::EltwiseType type) {
   // Construct graph
   OpsTestNet net;
...
@@ -753,7 +754,7 @@
   OpDefBuilder("Eltwise", "EltwiseTest")
       .Input("TInput0")
       .Input("TInput1")
-      .AddIntArg("type", static_cast<int>(ops::EltwiseType::SUM))
+      .AddIntArg("type", static_cast<int>(type))
       .AddIntArg("data_format", DataFormat::NCHW)
       .Output("TOutput")
       .Finalize(net.NewOperatorDef());
...
@@ -794,7 +795,7 @@
       .Input("QuantizedInput0")
       .Input("QuantizedInput1")
       .Output("QuantizedOutput")
-      .AddIntArg("type", static_cast<int>(ops::EltwiseType::SUM))
+      .AddIntArg("type", static_cast<int>(type))
       .AddIntArg("T", static_cast<int>(DT_UINT8))
       .Finalize(net.NewOperatorDef());
   net.Setup(DeviceType::CPU);
...
@@ -1009,9 +1010,11 @@ TEST_F(EltwiseOpTest, TensorGeneralBroadcastGPU) {
       {1, 1, 2, 1}, {2, 3}, {1, 1, 2, 5}, {4, 1, 0, 1, 4, 4, 9, 16, 25, 36});
 }

-TEST_F(EltwiseOpTest, QuantizedSum) {
-  QuantizedSum({1, 32, 32, 16});
-  QuantizedSum({1, 31, 31, 17});
+TEST_F(EltwiseOpTest, Quantized) {
+  Quantized({1, 32, 32, 16}, ops::EltwiseType::SUM);
+  Quantized({1, 31, 31, 17}, ops::EltwiseType::SUM);
+  Quantized({1, 32, 32, 16}, ops::EltwiseType::SUB);
+  Quantized({1, 31, 31, 17}, ops::EltwiseType::SUB);
 }

 }  // namespace test
...
...
@@ -38,7 +38,7 @@
 #endif  // MACE_ENABLE_NEON

 #ifdef MACE_ENABLE_QUANTIZE
-#include "mace/ops/gemmlowp_util.h"
+#include "mace/ops/common/gemmlowp_util.h"
 #endif  // MACE_ENABLE_QUANTIZE

 #ifdef MACE_ENABLE_OPENCL
...
...
@@ -22,7 +22,7 @@
 #ifdef MACE_ENABLE_QUANTIZE
 #include "mace/ops/fixpoint.h"
-#include "mace/ops/gemmlowp_util.h"
+#include "mace/ops/common/gemmlowp_util.h"
 #endif  // MACE_ENABLE_QUANTIZE

 #ifdef MACE_ENABLE_OPENCL
...
...
@@ -1423,8 +1423,9 @@ class Transformer(base_converter.ConverterInterface):
             else:
                 mace_check(op.type == MaceOp.Quantize.name,
                            "Quantization only support float ops, "
-                           "but get %s(%s)"
-                           % (op.name, op.type))
+                           "but get %s(%s, %s)"
+                           % (op.name, op.type,
+                              mace_pb2.DataType.Name(data_type_arg.i)))

         for input_node in self._option.input_nodes.values():
             new_input_name = self.input_name_map[input_node.name]
...
@@ -1725,18 +1726,29 @@ class Transformer(base_converter.ConverterInterface):
                 self.add_quantize_info(op, 0.0, 1.0)
                 self._quantize_activation_info[op.output[0]] = quantize_info
             elif (op.type == MaceOp.Eltwise.name
-                  and ConverterUtil.get_arg(op, MaceKeyword.mace_element_type_str).i == EltwiseType.SUM.value  # noqa
                   and not op.quantize_info
                   and len(op.input) == 2
                   and len(op.input[0]) not in self._consts
                   and len(op.input[1]) not in self._consts):
+                del op.quantize_info[:]
                 producer_op0 = self._producer[op.input[0]]
                 producer_op1 = self._producer[op.input[1]]
-                minval = producer_op0.quantize_info[0].minval \
-                    + producer_op1.quantize_info[0].minval
-                maxval = producer_op0.quantize_info[0].maxval \
-                    + producer_op1.quantize_info[0].maxval
+                if ConverterUtil.get_arg(
+                        op, MaceKeyword.mace_element_type_str).i \
+                        == EltwiseType.SUM.value:
+                    minval = producer_op0.quantize_info[0].minval \
+                        + producer_op1.quantize_info[0].minval
+                    maxval = producer_op0.quantize_info[0].maxval \
+                        + producer_op1.quantize_info[0].maxval
+                elif ConverterUtil.get_arg(
+                        op, MaceKeyword.mace_element_type_str).i \
+                        == EltwiseType.SUB.value:
+                    minval = producer_op0.quantize_info[0].minval \
+                        - producer_op1.quantize_info[0].maxval
+                    maxval = producer_op0.quantize_info[0].maxval \
+                        - producer_op1.quantize_info[0].minval
+                else:
+                    mace_check(False, "Quantized Elementwise only support:"
+                               " SUM and SUB now.")
                 quantize_info = \
                     self.add_quantize_info(op, minval, maxval)
                 self._quantize_activation_info[op.output[0]] = quantize_info
...
mace/ops/quantization_util.h
...
@@ -19,6 +19,8 @@
 #include <cmath>
 #include <limits>

+#include "mace/utils/logging.h"
+
 namespace mace {

 template<typename T>
...
@@ -138,11 +140,6 @@
 inline void QuantizeMultiplier(double multiplier,
                                int32_t* output_multiplier,
                                int32_t* shift) {
-  if (multiplier == 0.f) {
-    *output_multiplier = 0;
-    *shift = 0;
-    return;
-  }
   const double q = std::frexp(multiplier, shift);
   auto qint = static_cast<int64_t>(roundl(q * (1ll << 31)));
   if (qint == (1ll << 31)) {
...
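Dropping the multiplier == 0 special case is safe: std::frexp(0.0, shift) returns 0.0 and sets *shift to 0, so the general path already yields a zero multiplier and shift.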