Commit 3183be4d authored by 李滨

Merge branch 'quantize-weights' into 'master'

Support quantize-weights only

See merge request !801
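In rough terms, this change lets a converted model ship uint8 weights together with a per-tensor scale and zero_point even when inference itself runs in float: at load time the workspace checks whether the graph contains a Quantize op, and if it does not, every constant tensor marked as quantized is expanded back to float. The standalone sketch below is not the MACE code itself (names and values are illustrative); it only shows the affine expansion output[i] = scale * (input[i] - zero_point) that the loader applies:

#include <cstdint>
#include <vector>

// Illustrative stand-in for the load-time expansion of stored uint8 weights
// when the graph has no Quantize op (see Workspace::LoadModelTensor below).
std::vector<float> ExpandStoredWeights(const std::vector<uint8_t> &stored,
                                       float scale, int32_t zero_point) {
  std::vector<float> expanded(stored.size());
  for (size_t i = 0; i < stored.size(); ++i) {
    expanded[i] = scale * (static_cast<int32_t>(stored[i]) - zero_point);
  }
  return expanded;
}

For example, a stored byte 70 with scale 0.02 and zero_point 64 expands to 0.02 * (70 - 64) = 0.12f.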
......@@ -142,7 +142,8 @@ class Tensor {
buffer_ = &buffer_slice_;
}
Tensor() : Tensor(GetCPUAllocator(), DT_FLOAT) {}
explicit Tensor(bool is_weight = false)
: Tensor(GetCPUAllocator(), DT_FLOAT, is_weight) {}
~Tensor() {
if (is_buffer_owner_ && buffer_ != nullptr) {
......
......@@ -14,13 +14,12 @@
#include "mace/core/workspace.h"
#include <memory>
#include <string>
#include <vector>
#include <unordered_set>
#include <utility>
#include "mace/core/arg_helper.h"
#include "mace/utils/quantize.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/opencl_runtime.h"
#endif
......@@ -34,6 +33,15 @@ bool ShouldPreallocateMemoryForOp(const OperatorDef &op) {
};
return reuse_buffer_ops.find(op.type()) == reuse_buffer_ops.end();
}
bool HasQuantizeOp(const NetDef &net_def) {
for (auto &op : net_def.op()) {
if (op.type() == "Quantize") {
return true;
}
}
return false;
}
} // namespace
Workspace::Workspace() :
......@@ -146,6 +154,7 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
0, model_data_size);
tensor_buffer_->UnMap();
}
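// When the graph itself has no Quantize op, quantized constants are only a
// storage format for the weights; they are dequantized back to float below.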
bool has_quantize_op = HasQuantizeOp(net_def);
for (auto &const_tensor : net_def.tensors()) {
MACE_LATENCY_LOGGER(2, "Load tensor ", const_tensor.name());
VLOG(3) << "Tensor name: " << const_tensor.name()
......@@ -163,11 +172,27 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
const_tensor.data_size() *
GetEnumTypeSize(const_tensor.data_type())),
const_tensor.data_type(), true));
tensor->Reshape(dims);
tensor->SetScale(const_tensor.scale());
tensor->SetZeroPoint(const_tensor.zero_point());
tensor_map_[const_tensor.name()] = std::move(tensor);
// Only weights are quantized
if (const_tensor.quantized() && !has_quantize_op) {
std::unique_ptr<Tensor> dequantized_tensor(new Tensor(true));
dequantized_tensor->Resize(dims);
Tensor::MappingGuard quantize_guard(tensor.get());
Tensor::MappingGuard dequantize_guard(dequantized_tensor.get());
auto quantized_data = tensor->data<uint8_t>();
auto dequantized_data = dequantized_tensor->mutable_data<float>();
Dequantize(quantized_data,
tensor->size(),
tensor->scale(),
tensor->zero_point(),
dequantized_data);
tensor_map_[const_tensor.name()] = std::move(dequantized_tensor);
} else {
tensor_map_[const_tensor.name()] = std::move(tensor);
}
}
fused_buffer_ = true;
}
......
......@@ -42,8 +42,8 @@ struct GemmlowpOutputPipeline {
bias_addition_stage.bias_vector = bias_vector;
int32_t quantized_multiplier;
int32_t right_shift;
kernels::GetOutputMultiplierAndShift(lhs_scale, rhs_scale, output_scale,
&quantized_multiplier, &right_shift);
GetOutputMultiplierAndShift(lhs_scale, rhs_scale, output_scale,
&quantized_multiplier, &right_shift);
gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint
quantize_down_stage;
quantize_down_stage.result_offset_after_shift = output_zero_point;
......@@ -60,8 +60,8 @@ struct GemmlowpOutputPipeline {
const int32_t output_zero_point) {
int32_t quantized_multiplier;
int32_t right_shift;
kernels::GetOutputMultiplierAndShift(lhs_scale, rhs_scale, output_scale,
&quantized_multiplier, &right_shift);
GetOutputMultiplierAndShift(lhs_scale, rhs_scale, output_scale,
&quantized_multiplier, &right_shift);
gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint
quantize_down_stage;
quantize_down_stage.result_offset_after_shift = output_zero_point;
......
......@@ -23,154 +23,11 @@
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#include "mace/utils/quantize.h"
namespace mace {
namespace kernels {
template<typename T>
inline void AdjustRange(const float in_min_data,
const float in_max_data,
const bool non_zero,
float *scale,
int32_t *zero_point) {
// Re-range so that the float range contains zero and
// zero maps exactly onto an integer (u8) value.
const T quantized_min = std::numeric_limits<T>::lowest();
const T quantized_max = std::numeric_limits<T>::max();
if (quantized_min < 0) {
MACE_ASSERT(!non_zero, "Cannot nudge to non_zero quantize value.");
}
float out_max = std::max(0.f, in_max_data);
float out_min = std::min(0.f, in_min_data);
// Ensure in_min_data quantizes to at least 1, i.e. strictly above quantized_min.
if (non_zero) {
out_min = std::min(out_min,
in_min_data - (out_max - in_min_data)
/ (quantized_max - quantized_min - 1));
}
*scale = (out_max - out_min) / (quantized_max - quantized_min);
const float kEps = 1e-6;
if (out_min < -kEps && out_max > kEps) {
float quantized_zero = -out_min / *scale;
int32_t
quantized_zero_near_int = static_cast<int32_t>(roundf(quantized_zero));
*zero_point = quantized_zero_near_int;
if (fabs(quantized_zero - quantized_zero_near_int) > kEps) {
if (quantized_zero < quantized_zero_near_int || non_zero) {
// keep out_max fixed, and move out_min
*zero_point = static_cast<int32_t>(std::ceil(quantized_zero));
*scale = out_max / (quantized_max - *zero_point);
} else {
// keep out_min fixed, and move out_max
*scale = out_min / (quantized_min - *zero_point);
}
}
} else if (out_min > -kEps) {
*zero_point = quantized_min;
} else {
*zero_point = quantized_max;
}
}
template<typename T>
inline T Saturate(float value) {
int rounded_value = static_cast<int>(value);
if (rounded_value <= std::numeric_limits<T>::lowest()) {
return std::numeric_limits<T>::lowest();
} else if (rounded_value >= std::numeric_limits<T>::max()) {
return std::numeric_limits<T>::max();
} else {
return static_cast<T>(rounded_value);
}
}
inline void FindMinMax(const float *input,
const index_t size,
float *min_val, float *max_val) {
float max_v = std::numeric_limits<float>::lowest();
float min_v = std::numeric_limits<float>::max();
for (index_t i = 0; i < size; ++i) {
max_v = std::max(max_v, input[i]);
min_v = std::min(min_v, input[i]);
}
*min_val = min_v;
*max_val = max_v;
}
template<typename T>
inline void QuantizeWithScaleAndZeropoint(const float *input,
const index_t size,
float scale,
int32_t zero_point,
T *output) {
float recip_scale = 1 / scale;
#pragma omp parallel for
for (int i = 0; i < size; ++i) {
output[i] = Saturate<T>(roundf(zero_point + recip_scale * input[i]));
}
}
template<typename T>
inline void Quantize(const float *input,
const index_t size,
bool non_zero,
T *output,
float *scale,
int32_t *zero_point) {
float in_min_data;
float in_max_data;
FindMinMax(input, size, &in_min_data, &in_max_data);
AdjustRange<T>(in_min_data, in_max_data, non_zero,
scale, zero_point);
QuantizeWithScaleAndZeropoint(input, size, *scale, *zero_point, output);
}
template<typename T>
inline void Dequantize(const T *input,
const index_t size,
const float scale,
const int32_t zero_point,
float *output) {
#pragma omp parallel for
for (int i = 0; i < size; ++i) {
output[i] = scale * (input[i] - zero_point);
}
}
inline void QuantizeMultiplier(double multiplier,
int32_t* output_multiplier,
int32_t* shift) {
if (multiplier == 0.f) {
*output_multiplier = 0;
*shift = 0;
return;
}
const double q = std::frexp(multiplier, shift);
auto qint = static_cast<int64_t>(roundl(q * (1ll << 31)));
if (qint == (1ll << 31)) {
qint /= 2;
++*shift;
}
*output_multiplier = static_cast<int32_t>(qint);
MACE_CHECK(*output_multiplier <= std::numeric_limits<int32_t>::max());
}
inline void GetOutputMultiplierAndShift(
const float lhs_scale, const float rhs_scale, const float output_scale,
int32_t *quantized_multiplier, int *right_shift) {
float real_multiplier = lhs_scale * rhs_scale / output_scale;
MACE_CHECK(real_multiplier > 0.f && real_multiplier < 1.f, real_multiplier);
int exponent;
QuantizeMultiplier(real_multiplier, quantized_multiplier, &exponent);
*right_shift = -exponent;
MACE_CHECK(*right_shift >= 0);
}
template<DeviceType D, typename T>
struct QuantizeFunctor;
......
......@@ -1174,7 +1174,7 @@ void TestQuant(const index_t batch,
Tensor *bias = net.GetTensor("Bias");
auto bias_data = bias->data<float>();
std::vector<int32_t> q_bias(bias->size());
kernels::QuantizeWithScaleAndZeropoint(
QuantizeWithScaleAndZeropoint(
bias_data, bias->size(), q_input->scale() * q_filter->scale(), 0,
q_bias.data());
net.AddInputFromArray<DeviceType::CPU, int32_t>("QuantizedBias",
......
......@@ -459,7 +459,7 @@ void TestQuant(const index_t batch,
Tensor *bias = net.GetTensor("Bias");
auto bias_data = bias->data<float>();
std::vector<int32_t> q_bias(bias->size());
kernels::QuantizeWithScaleAndZeropoint(
QuantizeWithScaleAndZeropoint(
bias_data, bias->size(), q_input->scale() * q_filter->scale(), 0,
q_bias.data());
net.AddInputFromArray<DeviceType::CPU, int32_t>(
......
......@@ -277,7 +277,7 @@ void QuantRandom(const index_t batch,
Tensor *bias = net.GetTensor("Bias");
auto bias_data = bias->data<float>();
std::vector<int32_t> q_bias(bias->size());
kernels::QuantizeWithScaleAndZeropoint(
QuantizeWithScaleAndZeropoint(
bias_data, bias->size(), q_input->scale() * q_weight->scale(), 0,
q_bias.data());
net.AddInputFromArray<DeviceType::CPU, int32_t>("QuantizedBias",
......
......@@ -36,6 +36,7 @@ message ConstTensor {
optional int64 data_size = 7;
optional float scale = 8;
optional int32 zero_point = 9;
optional bool quantized = 10 [default = false];
optional uint32 node_id = 100;
}
......
......@@ -1667,9 +1667,6 @@ class Transformer(base_converter.ConverterInterface):
def quantize_tensor(self, tensor):
"""Assume biasadd has been already folded with convolution and fc"""
if not self._option.quantize:
return False
if tensor.data_type == mace_pb2.DT_FLOAT:
ops = self._consumers.get(tensor.name, None)
if len(ops) == 1 and ops[0].type in [MaceOp.Conv2D.name,
......@@ -1697,6 +1694,7 @@ class Transformer(base_converter.ConverterInterface):
tensor.int32_data.extend(quantized_tensor.data)
tensor.scale = quantized_tensor.scale
tensor.zero_point = quantized_tensor.zero
tensor.quantized = True
self._quantized_tensor.update([tensor.name])
return False
......
......@@ -34,6 +34,7 @@ void CreateTensor{{tensor_info.id}}(mace::ConstTensor *const_tensor) {
const_tensor->set_node_id({{ tensor.node_id }});
const_tensor->set_scale({{ tensor.scale }});
const_tensor->set_zero_point({{ tensor.zero_point }});
const_tensor->set_quantized({{ tensor.quantized | lower}});
}
} // namespace {{tag}}
......
......@@ -15,20 +15,17 @@ cc_library(
"logging.cc",
"string_util.cc",
],
hdrs = [
"env_time.h",
"logging.h",
"memory_logging.h",
"rwlock.h",
"string_util.h",
"timer.h",
"tuner.h",
"utils.h",
hdrs = glob([
"*.h",
]),
copts = [
"-Werror",
"-Wextra",
"-Wno-missing-field-initializers",
],
linkopts = if_android([
"-llog",
]),
copts = ["-Werror", "-Wextra", "-Wno-missing-field-initializers"],
deps = [
"//mace/public",
],
......@@ -40,7 +37,11 @@ cc_test(
srcs = [
"tuner_test.cc",
],
copts = ["-Werror", "-Wextra", "-Wno-missing-field-initializers"],
copts = [
"-Werror",
"-Wextra",
"-Wno-missing-field-initializers",
],
linkopts = ["-ldl"] + if_android([
"-pie",
"-lm", # Required by unordered_map
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_UTILS_QUANTIZE_H_
#define MACE_UTILS_QUANTIZE_H_
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <limits>
namespace mace {
template<typename T>
inline void AdjustRange(const float in_min_data,
const float in_max_data,
const bool non_zero,
float *scale,
int32_t *zero_point) {
// Re-range so that the float range contains zero and
// zero maps exactly onto an integer (u8) value.
const T quantized_min = std::numeric_limits<T>::lowest();
const T quantized_max = std::numeric_limits<T>::max();
if (quantized_min < 0) {
MACE_ASSERT(!non_zero, "Cannot nudge to non_zero quantize value.");
}
float out_max = std::max(0.f, in_max_data);
float out_min = std::min(0.f, in_min_data);
// Ensure in_min_data quantizes to at least 1, i.e. strictly above quantized_min.
if (non_zero) {
out_min = std::min(out_min,
in_min_data - (out_max - in_min_data)
/ (quantized_max - quantized_min - 1));
}
*scale = (out_max - out_min) / (quantized_max - quantized_min);
const float kEps = 1e-6;
if (out_min < -kEps && out_max > kEps) {
float quantized_zero = -out_min / *scale;
int32_t
quantized_zero_near_int = static_cast<int32_t>(roundf(quantized_zero));
*zero_point = quantized_zero_near_int;
if (fabs(quantized_zero - quantized_zero_near_int) > kEps) {
if (quantized_zero < quantized_zero_near_int || non_zero) {
// keep out_max fixed, and move out_min
*zero_point = static_cast<int32_t>(std::ceil(quantized_zero));
*scale = out_max / (quantized_max - *zero_point);
} else {
// keep out_min fixed, and move out_max
*scale = out_min / (quantized_min - *zero_point);
}
}
} else if (out_min > -kEps) {
*zero_point = quantized_min;
} else {
*zero_point = quantized_max;
}
}
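// Worked example (illustrative numbers): for a uint8 target and a float range
// of [-1.0, 3.0] with non_zero == false, the first pass gives
// scale = 4 / 255 (about 0.0157) and quantized_zero = 63.75. Since that is not
// an integer, out_max is kept fixed and the range is nudged:
// zero_point = ceil(63.75) = 64 and scale = 3.0 / (255 - 64), so the real
// value 0.0 now maps exactly onto the integer 64.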
template<typename T>
inline T Saturate(float value) {
int rounded_value = static_cast<int>(value);
if (rounded_value <= std::numeric_limits<T>::lowest()) {
return std::numeric_limits<T>::lowest();
} else if (rounded_value >= std::numeric_limits<T>::max()) {
return std::numeric_limits<T>::max();
} else {
return static_cast<T>(rounded_value);
}
}
inline void FindMinMax(const float *input,
const index_t size,
float *min_val, float *max_val) {
float max_v = std::numeric_limits<float>::lowest();
float min_v = std::numeric_limits<float>::max();
for (index_t i = 0; i < size; ++i) {
max_v = std::max(max_v, input[i]);
min_v = std::min(min_v, input[i]);
}
*min_val = min_v;
*max_val = max_v;
}
template<typename T>
inline void QuantizeWithScaleAndZeropoint(const float *input,
const index_t size,
float scale,
int32_t zero_point,
T *output) {
float recip_scale = 1 / scale;
#pragma omp parallel for
for (int i = 0; i < size; ++i) {
output[i] = Saturate<T>(roundf(zero_point + recip_scale * input[i]));
}
}
template<typename T>
inline void Quantize(const float *input,
const index_t size,
bool non_zero,
T *output,
float *scale,
int32_t *zero_point) {
float in_min_data;
float in_max_data;
FindMinMax(input, size, &in_min_data, &in_max_data);
AdjustRange<T>(in_min_data, in_max_data, non_zero,
scale, zero_point);
QuantizeWithScaleAndZeropoint(input, size, *scale, *zero_point, output);
}
template<typename T>
inline void Dequantize(const T *input,
const index_t size,
const float scale,
const int32_t zero_point,
float *output) {
#pragma omp parallel for
for (int i = 0; i < size; ++i) {
output[i] = scale * (input[i] - zero_point);
}
}
inline void QuantizeMultiplier(double multiplier,
int32_t* output_multiplier,
int32_t* shift) {
if (multiplier == 0.f) {
*output_multiplier = 0;
*shift = 0;
return;
}
const double q = std::frexp(multiplier, shift);
auto qint = static_cast<int64_t>(roundl(q * (1ll << 31)));
if (qint == (1ll << 31)) {
qint /= 2;
++*shift;
}
*output_multiplier = static_cast<int32_t>(qint);
MACE_CHECK(*output_multiplier <= std::numeric_limits<int32_t>::max());
}
inline void GetOutputMultiplierAndShift(
const float lhs_scale, const float rhs_scale, const float output_scale,
int32_t *quantized_multiplier, int *right_shift) {
float real_multiplier = lhs_scale * rhs_scale / output_scale;
MACE_CHECK(real_multiplier > 0.f && real_multiplier < 1.f, real_multiplier);
int exponent;
QuantizeMultiplier(real_multiplier, quantized_multiplier, &exponent);
*right_shift = -exponent;
MACE_CHECK(*right_shift >= 0);
}
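// The (quantized_multiplier, right_shift) pair approximates the real
// multiplier as quantized_multiplier * 2^-31 * 2^-right_shift. For example
// (illustrative), real_multiplier = 0.3 splits into 0.6 * 2^-1, giving
// quantized_multiplier = round(0.6 * 2^31) and right_shift = 1; gemmlowp's
// fixed-point output stage then rescales an int32 accumulator acc roughly as
// ((int64_t(acc) * quantized_multiplier) >> 31) >> right_shift, with rounding.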
} // namespace mace
#endif // MACE_UTILS_QUANTIZE_H_
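A minimal usage sketch for the helpers in this header, assuming the project's usual headers providing index_t and the MACE_* macros are on the include path (values are illustrative):

#include <cstdint>
#include <vector>

#include "mace/utils/quantize.h"

// Quantize a small float buffer to uint8, then recover an approximation.
void RoundTripExample() {
  std::vector<float> weights = {-1.0f, 0.0f, 0.5f, 3.0f};
  std::vector<uint8_t> quantized(weights.size());
  std::vector<float> recovered(weights.size());
  float scale = 0.0f;
  int32_t zero_point = 0;
  mace::Quantize(weights.data(), weights.size(), /*non_zero=*/false,
                 quantized.data(), &scale, &zero_point);
  mace::Dequantize(quantized.data(), quantized.size(), scale, zero_point,
                   recovered.data());
  // Each recovered[i] differs from weights[i] by at most about one scale step.
}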