diff --git a/mace/core/tensor.h b/mace/core/tensor.h index a497309f0ff18d6eed046c0a031caa2808671279..713a6d1e8d1755c485264c553cd795f304ddd3aa 100644 --- a/mace/core/tensor.h +++ b/mace/core/tensor.h @@ -142,7 +142,8 @@ class Tensor { buffer_ = &buffer_slice_; } - Tensor() : Tensor(GetCPUAllocator(), DT_FLOAT) {} + explicit Tensor(bool is_weight = false) + : Tensor(GetCPUAllocator(), DT_FLOAT, is_weight) {} ~Tensor() { if (is_buffer_owner_ && buffer_ != nullptr) { diff --git a/mace/core/workspace.cc b/mace/core/workspace.cc index 07480ad4f3fc73f85492b53bc9e2270e766f0a24..4a34cd44fd4d53e64d618ce33988e69160497bbd 100644 --- a/mace/core/workspace.cc +++ b/mace/core/workspace.cc @@ -14,13 +14,12 @@ #include "mace/core/workspace.h" -#include -#include -#include #include #include #include "mace/core/arg_helper.h" +#include "mace/utils/quantize.h" + #ifdef MACE_ENABLE_OPENCL #include "mace/core/runtime/opencl/opencl_runtime.h" #endif @@ -34,6 +33,15 @@ bool ShouldPreallocateMemoryForOp(const OperatorDef &op) { }; return reuse_buffer_ops.find(op.type()) == reuse_buffer_ops.end(); } + +bool HasQuantizeOp(const NetDef &net_def) { + for (auto &op : net_def.op()) { + if (op.type() == "Quantize") { + return true; + } + } + return false; +} } // namespace Workspace::Workspace() : @@ -146,6 +154,7 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def, 0, model_data_size); tensor_buffer_->UnMap(); } + bool has_quantize_op = HasQuantizeOp(net_def); for (auto &const_tensor : net_def.tensors()) { MACE_LATENCY_LOGGER(2, "Load tensor ", const_tensor.name()); VLOG(3) << "Tensor name: " << const_tensor.name() @@ -163,11 +172,27 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def, const_tensor.data_size() * GetEnumTypeSize(const_tensor.data_type())), const_tensor.data_type(), true)); - tensor->Reshape(dims); tensor->SetScale(const_tensor.scale()); tensor->SetZeroPoint(const_tensor.zero_point()); - tensor_map_[const_tensor.name()] = std::move(tensor); + + // Only weights are quantized + if (const_tensor.quantized() && !has_quantize_op) { + std::unique_ptr dequantized_tensor(new Tensor(true)); + dequantized_tensor->Resize(dims); + Tensor::MappingGuard quantize_guard(tensor.get()); + Tensor::MappingGuard dequantize_guard(dequantized_tensor.get()); + auto quantized_data = tensor->data(); + auto dequantized_data = dequantized_tensor->mutable_data(); + Dequantize(quantized_data, + tensor->size(), + tensor->scale(), + tensor->zero_point(), + dequantized_data); + tensor_map_[const_tensor.name()] = std::move(dequantized_tensor); + } else { + tensor_map_[const_tensor.name()] = std::move(tensor); + } } fused_buffer_ = true; } diff --git a/mace/kernels/gemmlowp_util.h b/mace/kernels/gemmlowp_util.h index 28d45d3a6a80fc2e3af3b9b12cd71e38b23d474d..55b5e4b5517f773948786d406ff7c4d63f6a30f6 100644 --- a/mace/kernels/gemmlowp_util.h +++ b/mace/kernels/gemmlowp_util.h @@ -44,8 +44,8 @@ struct GemmlowpOutputPipeline { bias_addition_stage.bias_vector = bias_vector; int32_t quantized_multiplier; int32_t right_shift; - kernels::GetOutputMultiplierAndShift(lhs_scale, rhs_scale, output_scale, - &quantized_multiplier, &right_shift); + GetOutputMultiplierAndShift(lhs_scale, rhs_scale, output_scale, + &quantized_multiplier, &right_shift); gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint quantize_down_stage; quantize_down_stage.result_offset_after_shift = output_zero_point; @@ -62,8 +62,8 @@ struct GemmlowpOutputPipeline { const int32_t output_zero_point) { int32_t quantized_multiplier; int32_t right_shift; - kernels::GetOutputMultiplierAndShift(lhs_scale, rhs_scale, output_scale, - &quantized_multiplier, &right_shift); + GetOutputMultiplierAndShift(lhs_scale, rhs_scale, output_scale, + &quantized_multiplier, &right_shift); gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint quantize_down_stage; quantize_down_stage.result_offset_after_shift = output_zero_point; diff --git a/mace/kernels/quantize.h b/mace/kernels/quantize.h index c0280931b0643d1b44a872f96171f1e67d1f2799..337a831618a407de3fb443e67159347196b18e33 100644 --- a/mace/kernels/quantize.h +++ b/mace/kernels/quantize.h @@ -23,154 +23,11 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" #include "mace/kernels/kernel.h" +#include "mace/utils/quantize.h" namespace mace { namespace kernels { -template -inline void AdjustRange(const float in_min_data, - const float in_max_data, - const bool non_zero, - float *scale, - int32_t *zero_point) { - // re-range to make range include zero float and - // make zero float as integer u8 - const T quantized_min = std::numeric_limits::lowest(); - const T quantized_max = std::numeric_limits::max(); - if (quantized_min < 0) { - MACE_ASSERT(!non_zero, "Cannot nudge to non_zero quantize value."); - } - - float out_max = std::max(0.f, in_max_data); - float out_min = std::min(0.f, in_min_data); - // make in_min_data quantize as greater than 1 - if (non_zero) { - out_min = std::min(out_min, - in_min_data - (out_max - in_min_data) - / (quantized_max - quantized_min - 1)); - } - - *scale = (out_max - out_min) / (quantized_max - quantized_min); - const float kEps = 1e-6; - if (out_min < -kEps && out_max > kEps) { - float quantized_zero = -out_min / *scale; - int32_t - quantized_zero_near_int = static_cast(roundf(quantized_zero)); - *zero_point = quantized_zero_near_int; - if (fabs(quantized_zero - quantized_zero_near_int) > kEps) { - if (quantized_zero < quantized_zero_near_int || non_zero) { - // keep out_max fixed, and move out_min - *zero_point = static_cast(std::ceil(quantized_zero)); - *scale = out_max / (quantized_max - *zero_point); - } else { - // keep out_min fixed, and move out_max - *scale = out_min / (quantized_min - *zero_point); - } - } - } else if (out_min > -kEps) { - *zero_point = quantized_min; - } else { - *zero_point = quantized_max; - } -} - -template -inline T Saturate(float value) { - int rounded_value = static_cast(value); - if (rounded_value <= std::numeric_limits::lowest()) { - return std::numeric_limits::lowest(); - } else if (rounded_value >= std::numeric_limits::max()) { - return std::numeric_limits::max(); - } else { - return static_cast(rounded_value); - } -} - -inline void FindMinMax(const float *input, - const index_t size, - float *min_val, float *max_val) { - float max_v = std::numeric_limits::lowest(); - float min_v = std::numeric_limits::max(); - for (index_t i = 0; i < size; ++i) { - max_v = std::max(max_v, input[i]); - min_v = std::min(min_v, input[i]); - } - *min_val = min_v; - *max_val = max_v; -} - -template -inline void QuantizeWithScaleAndZeropoint(const float *input, - const index_t size, - float scale, - int32_t zero_point, - T *output) { - float recip_scale = 1 / scale; -#pragma omp parallel for - for (int i = 0; i < size; ++i) { - output[i] = Saturate(roundf(zero_point + recip_scale * input[i])); - } -} - -template -inline void Quantize(const float *input, - const index_t size, - bool non_zero, - T *output, - float *scale, - int32_t *zero_point) { - float in_min_data; - float in_max_data; - FindMinMax(input, size, &in_min_data, &in_max_data); - - AdjustRange(in_min_data, in_max_data, non_zero, - scale, zero_point); - - QuantizeWithScaleAndZeropoint(input, size, *scale, *zero_point, output); -} - -template -inline void Dequantize(const T *input, - const index_t size, - const float scale, - const int32_t zero_point, - float *output) { -#pragma omp parallel for - for (int i = 0; i < size; ++i) { - output[i] = scale * (input[i] - zero_point); - } -} - -inline void QuantizeMultiplier(double multiplier, - int32_t* output_multiplier, - int32_t* shift) { - if (multiplier == 0.f) { - *output_multiplier = 0; - *shift = 0; - return; - } - const double q = std::frexp(multiplier, shift); - auto qint = static_cast(roundl(q * (1ll << 31))); - if (qint == (1ll << 31)) { - qint /= 2; - ++*shift; - } - *output_multiplier = static_cast(qint); - MACE_CHECK(*output_multiplier <= std::numeric_limits::max()); -} - -inline void GetOutputMultiplierAndShift( - const float lhs_scale, const float rhs_scale, const float output_scale, - int32_t *quantized_multiplier, int *right_shift) { - float real_multiplier = lhs_scale * rhs_scale / output_scale; - MACE_CHECK(real_multiplier > 0.f && real_multiplier < 1.f, real_multiplier); - - int exponent; - QuantizeMultiplier(real_multiplier, quantized_multiplier, &exponent); - *right_shift = -exponent; - MACE_CHECK(*right_shift >= 0); -} - template struct QuantizeFunctor; diff --git a/mace/ops/conv_2d_test.cc b/mace/ops/conv_2d_test.cc index dd338e275356fd1bfab3cc21b50fade2b77da46e..750d64ef2feba2ca855e7dfe7e4c067d213b35ae 100644 --- a/mace/ops/conv_2d_test.cc +++ b/mace/ops/conv_2d_test.cc @@ -1174,7 +1174,7 @@ void TestQuant(const index_t batch, Tensor *bias = net.GetTensor("Bias"); auto bias_data = bias->data(); std::vector q_bias(bias->size()); - kernels::QuantizeWithScaleAndZeropoint( + QuantizeWithScaleAndZeropoint( bias_data, bias->size(), q_input->scale() * q_filter->scale(), 0, q_bias.data()); net.AddInputFromArray("QuantizedBias", diff --git a/mace/ops/depthwise_conv2d_test.cc b/mace/ops/depthwise_conv2d_test.cc index 6d6b84f1f79356d9f1eb6411fc564c912fca3e1d..7f7e1c17e7d25ef7c5813cdaf596f8dbb21ae6c1 100644 --- a/mace/ops/depthwise_conv2d_test.cc +++ b/mace/ops/depthwise_conv2d_test.cc @@ -459,7 +459,7 @@ void TestQuant(const index_t batch, Tensor *bias = net.GetTensor("Bias"); auto bias_data = bias->data(); std::vector q_bias(bias->size()); - kernels::QuantizeWithScaleAndZeropoint( + QuantizeWithScaleAndZeropoint( bias_data, bias->size(), q_input->scale() * q_filter->scale(), 0, q_bias.data()); net.AddInputFromArray( diff --git a/mace/ops/fully_connected_test.cc b/mace/ops/fully_connected_test.cc index cdeba2439a94e5987c9844c4482f57af78dbb14c..4bdc53f4cab924886e1d9df5c582bac26e2dfa7c 100644 --- a/mace/ops/fully_connected_test.cc +++ b/mace/ops/fully_connected_test.cc @@ -277,7 +277,7 @@ void QuantRandom(const index_t batch, Tensor *bias = net.GetTensor("Bias"); auto bias_data = bias->data(); std::vector q_bias(bias->size()); - kernels::QuantizeWithScaleAndZeropoint( + QuantizeWithScaleAndZeropoint( bias_data, bias->size(), q_input->scale() * q_weight->scale(), 0, q_bias.data()); net.AddInputFromArray("QuantizedBias", diff --git a/mace/proto/mace.proto b/mace/proto/mace.proto index 9ec3d96e27a919eeef74cce716e6006a7e66498d..0bf4c7da2c8b4d77dd042bd2c0c7a30cf183a942 100644 --- a/mace/proto/mace.proto +++ b/mace/proto/mace.proto @@ -36,6 +36,7 @@ message ConstTensor { optional int64 data_size = 7; optional float scale = 8; optional int32 zero_point = 9; + optional bool quantized = 10 [default = false]; optional uint32 node_id = 100; } diff --git a/mace/python/tools/converter_tool/transformer.py b/mace/python/tools/converter_tool/transformer.py index 2302626d1ce83c7a718be6c974f03282d79db4a0..5fbab749ddcf0f163bad9f42ee3987c989712ce8 100644 --- a/mace/python/tools/converter_tool/transformer.py +++ b/mace/python/tools/converter_tool/transformer.py @@ -1667,9 +1667,6 @@ class Transformer(base_converter.ConverterInterface): def quantize_tensor(self, tensor): """Assume biasadd has been already folded with convolution and fc""" - if not self._option.quantize: - return False - if tensor.data_type == mace_pb2.DT_FLOAT: ops = self._consumers.get(tensor.name, None) if len(ops) == 1 and ops[0].type in [MaceOp.Conv2D.name, @@ -1697,6 +1694,7 @@ class Transformer(base_converter.ConverterInterface): tensor.int32_data.extend(quantized_tensor.data) tensor.scale = quantized_tensor.scale tensor.zero_point = quantized_tensor.zero + tensor.quantized = True self._quantized_tensor.update([tensor.name]) return False diff --git a/mace/python/tools/tensor_source.jinja2 b/mace/python/tools/tensor_source.jinja2 index 66feee0ed7af780806e46ae37080f084cf8c0efb..9cb3805d519e56f53b70ac39fe7b81138fc2b0ae 100644 --- a/mace/python/tools/tensor_source.jinja2 +++ b/mace/python/tools/tensor_source.jinja2 @@ -34,6 +34,7 @@ void CreateTensor{{tensor_info.id}}(mace::ConstTensor *const_tensor) { const_tensor->set_node_id({{ tensor.node_id }}); const_tensor->set_scale({{ tensor.scale }}); const_tensor->set_zero_point({{ tensor.zero_point }}); + const_tensor->set_quantized({{ tensor.quantized | lower}}); } } // namespace {{tag}} diff --git a/mace/utils/BUILD b/mace/utils/BUILD index c383eb87d1d56ae3ae0e7aec13517a43128ad66a..283efa490e0a54ed48df9a61a289e5b67bf503f8 100644 --- a/mace/utils/BUILD +++ b/mace/utils/BUILD @@ -15,20 +15,17 @@ cc_library( "logging.cc", "string_util.cc", ], - hdrs = [ - "env_time.h", - "logging.h", - "memory_logging.h", - "rwlock.h", - "string_util.h", - "timer.h", - "tuner.h", - "utils.h", + hdrs = glob([ + "*.h", + ]), + copts = [ + "-Werror", + "-Wextra", + "-Wno-missing-field-initializers", ], linkopts = if_android([ "-llog", ]), - copts = ["-Werror", "-Wextra", "-Wno-missing-field-initializers"], deps = [ "//mace/public", ], @@ -40,7 +37,11 @@ cc_test( srcs = [ "tuner_test.cc", ], - copts = ["-Werror", "-Wextra", "-Wno-missing-field-initializers"], + copts = [ + "-Werror", + "-Wextra", + "-Wno-missing-field-initializers", + ], linkopts = ["-ldl"] + if_android([ "-pie", "-lm", # Required by unordered_map diff --git a/mace/utils/quantize.h b/mace/utils/quantize.h new file mode 100644 index 0000000000000000000000000000000000000000..dfaaff1560925c6d1674958ea8f9ae55f4842dd6 --- /dev/null +++ b/mace/utils/quantize.h @@ -0,0 +1,169 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_UTILS_QUANTIZE_H_ +#define MACE_UTILS_QUANTIZE_H_ + +#include +#include + +namespace mace { + +template +inline void AdjustRange(const float in_min_data, + const float in_max_data, + const bool non_zero, + float *scale, + int32_t *zero_point) { + // re-range to make range include zero float and + // make zero float as integer u8 + const T quantized_min = std::numeric_limits::lowest(); + const T quantized_max = std::numeric_limits::max(); + if (quantized_min < 0) { + MACE_ASSERT(!non_zero, "Cannot nudge to non_zero quantize value."); + } + + float out_max = std::max(0.f, in_max_data); + float out_min = std::min(0.f, in_min_data); + // make in_min_data quantize as greater than 1 + if (non_zero) { + out_min = std::min(out_min, + in_min_data - (out_max - in_min_data) + / (quantized_max - quantized_min - 1)); + } + + *scale = (out_max - out_min) / (quantized_max - quantized_min); + const float kEps = 1e-6; + if (out_min < -kEps && out_max > kEps) { + float quantized_zero = -out_min / *scale; + int32_t + quantized_zero_near_int = static_cast(roundf(quantized_zero)); + *zero_point = quantized_zero_near_int; + if (fabs(quantized_zero - quantized_zero_near_int) > kEps) { + if (quantized_zero < quantized_zero_near_int || non_zero) { + // keep out_max fixed, and move out_min + *zero_point = static_cast(std::ceil(quantized_zero)); + *scale = out_max / (quantized_max - *zero_point); + } else { + // keep out_min fixed, and move out_max + *scale = out_min / (quantized_min - *zero_point); + } + } + } else if (out_min > -kEps) { + *zero_point = quantized_min; + } else { + *zero_point = quantized_max; + } +} + +template +inline T Saturate(float value) { + int rounded_value = static_cast(value); + if (rounded_value <= std::numeric_limits::lowest()) { + return std::numeric_limits::lowest(); + } else if (rounded_value >= std::numeric_limits::max()) { + return std::numeric_limits::max(); + } else { + return static_cast(rounded_value); + } +} + +inline void FindMinMax(const float *input, + const index_t size, + float *min_val, float *max_val) { + float max_v = std::numeric_limits::lowest(); + float min_v = std::numeric_limits::max(); + for (index_t i = 0; i < size; ++i) { + max_v = std::max(max_v, input[i]); + min_v = std::min(min_v, input[i]); + } + *min_val = min_v; + *max_val = max_v; +} + +template +inline void QuantizeWithScaleAndZeropoint(const float *input, + const index_t size, + float scale, + int32_t zero_point, + T *output) { + float recip_scale = 1 / scale; +#pragma omp parallel for + for (int i = 0; i < size; ++i) { + output[i] = Saturate(roundf(zero_point + recip_scale * input[i])); + } +} + +template +inline void Quantize(const float *input, + const index_t size, + bool non_zero, + T *output, + float *scale, + int32_t *zero_point) { + float in_min_data; + float in_max_data; + FindMinMax(input, size, &in_min_data, &in_max_data); + + AdjustRange(in_min_data, in_max_data, non_zero, + scale, zero_point); + + QuantizeWithScaleAndZeropoint(input, size, *scale, *zero_point, output); +} + +template +inline void Dequantize(const T *input, + const index_t size, + const float scale, + const int32_t zero_point, + float *output) { +#pragma omp parallel for + for (int i = 0; i < size; ++i) { + output[i] = scale * (input[i] - zero_point); + } +} + +inline void QuantizeMultiplier(double multiplier, + int32_t* output_multiplier, + int32_t* shift) { + if (multiplier == 0.f) { + *output_multiplier = 0; + *shift = 0; + return; + } + const double q = std::frexp(multiplier, shift); + auto qint = static_cast(roundl(q * (1ll << 31))); + if (qint == (1ll << 31)) { + qint /= 2; + ++*shift; + } + *output_multiplier = static_cast(qint); + MACE_CHECK(*output_multiplier <= std::numeric_limits::max()); +} + +inline void GetOutputMultiplierAndShift( + const float lhs_scale, const float rhs_scale, const float output_scale, + int32_t *quantized_multiplier, int *right_shift) { + float real_multiplier = lhs_scale * rhs_scale / output_scale; + MACE_CHECK(real_multiplier > 0.f && real_multiplier < 1.f, real_multiplier); + + int exponent; + QuantizeMultiplier(real_multiplier, quantized_multiplier, &exponent); + *right_shift = -exponent; + MACE_CHECK(*right_shift >= 0); +} + +} // namespace mace + +#endif // MACE_UTILS_QUANTIZE_H_