Commit 3183be4d authored by 李滨

Merge branch 'quantize-weights' into 'master'

Support quantize-weights only

See merge request !801
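In rough terms, this change lets a converted model ship uint8 weights together with a per-tensor scale and zero_point even when inference itself runs in float: at load time the workspace checks whether the graph contains a Quantize op, and if it does not, every constant tensor marked as quantized is expanded back to float. The standalone sketch below is not the MACE code itself (names and values are illustrative); it only shows the affine expansion output[i] = scale * (input[i] - zero_point) that the loader applies:

#include <cstdint>
#include <vector>

// Illustrative stand-in for the load-time expansion of stored uint8 weights
// when the graph has no Quantize op (see Workspace::LoadModelTensor below).
std::vector<float> ExpandStoredWeights(const std::vector<uint8_t> &stored,
                                       float scale, int32_t zero_point) {
  std::vector<float> expanded(stored.size());
  for (size_t i = 0; i < stored.size(); ++i) {
    expanded[i] = scale * (static_cast<int32_t>(stored[i]) - zero_point);
  }
  return expanded;
}

For example, a stored byte 70 with scale 0.02 and zero_point 64 expands to 0.02 * (70 - 64) = 0.12f.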
......@@ -142,7 +142,8 @@ class Tensor {
buffer_ = &buffer_slice_;
}
Tensor() : Tensor(GetCPUAllocator(), DT_FLOAT) {}
explicit Tensor(bool is_weight = false)
: Tensor(GetCPUAllocator(), DT_FLOAT, is_weight) {}
~Tensor() {
if (is_buffer_owner_ && buffer_ != nullptr) {
......
......@@ -14,13 +14,12 @@
#include "mace/core/workspace.h"
#include <memory>
#include <string>
#include <vector>
#include <unordered_set>
#include <utility>
#include "mace/core/arg_helper.h"
#include "mace/utils/quantize.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/opencl_runtime.h"
#endif
......@@ -34,6 +33,15 @@ bool ShouldPreallocateMemoryForOp(const OperatorDef &op) {
};
return reuse_buffer_ops.find(op.type()) == reuse_buffer_ops.end();
}
bool HasQuantizeOp(const NetDef &net_def) {
for (auto &op : net_def.op()) {
if (op.type() == "Quantize") {
return true;
}
}
return false;
}
} // namespace
Workspace::Workspace() :
......@@ -146,6 +154,7 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
0, model_data_size);
tensor_buffer_->UnMap();
}
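// When the graph itself has no Quantize op, quantized constants are only a
// storage format for the weights; they are dequantized back to float below.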
bool has_quantize_op = HasQuantizeOp(net_def);
for (auto &const_tensor : net_def.tensors()) {
MACE_LATENCY_LOGGER(2, "Load tensor ", const_tensor.name());
VLOG(3) << "Tensor name: " << const_tensor.name()
......@@ -163,11 +172,27 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
const_tensor.data_size() *
GetEnumTypeSize(const_tensor.data_type())),
const_tensor.data_type(), true));
tensor->Reshape(dims);
tensor->SetScale(const_tensor.scale());
tensor->SetZeroPoint(const_tensor.zero_point());
tensor_map_[const_tensor.name()] = std::move(tensor);
// Only weights are quantized
if (const_tensor.quantized() && !has_quantize_op) {
std::unique_ptr<Tensor> dequantized_tensor(new Tensor(true));
dequantized_tensor->Resize(dims);
Tensor::MappingGuard quantize_guard(tensor.get());
Tensor::MappingGuard dequantize_guard(dequantized_tensor.get());
auto quantized_data = tensor->data<uint8_t>();
auto dequantized_data = dequantized_tensor->mutable_data<float>();
Dequantize(quantized_data,
tensor->size(),
tensor->scale(),
tensor->zero_point(),
dequantized_data);
tensor_map_[const_tensor.name()] = std::move(dequantized_tensor);
} else {
tensor_map_[const_tensor.name()] = std::move(tensor);
}
}
fused_buffer_ = true;
}
......
......@@ -42,8 +42,8 @@ struct GemmlowpOutputPipeline {
bias_addition_stage.bias_vector = bias_vector;
int32_t quantized_multiplier;
int32_t right_shift;
kernels::GetOutputMultiplierAndShift(lhs_scale, rhs_scale, output_scale,
&quantized_multiplier, &right_shift);
GetOutputMultiplierAndShift(lhs_scale, rhs_scale, output_scale,
&quantized_multiplier, &right_shift);
gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint
quantize_down_stage;
quantize_down_stage.result_offset_after_shift = output_zero_point;
......@@ -60,8 +60,8 @@ struct GemmlowpOutputPipeline {
const int32_t output_zero_point) {
int32_t quantized_multiplier;
int32_t right_shift;
kernels::GetOutputMultiplierAndShift(lhs_scale, rhs_scale, output_scale,
&quantized_multiplier, &right_shift);
GetOutputMultiplierAndShift(lhs_scale, rhs_scale, output_scale,
&quantized_multiplier, &right_shift);
gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint
quantize_down_stage;
quantize_down_stage.result_offset_after_shift = output_zero_point;
......
......@@ -23,154 +23,11 @@
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#include "mace/utils/quantize.h"
namespace mace {
namespace kernels {
template<typename T>
inline void AdjustRange(const float in_min_data,
const float in_max_data,
const bool non_zero,
float *scale,
int32_t *zero_point) {
// Re-range so that the float range contains zero and
// zero maps exactly onto an integer (u8) value.
const T quantized_min = std::numeric_limits<T>::lowest();
const T quantized_max = std::numeric_limits<T>::max();
if (quantized_min < 0) {
MACE_ASSERT(!non_zero, "Cannot nudge to non_zero quantize value.");
}
float out_max = std::max(0.f, in_max_data);
float out_min = std::min(0.f, in_min_data);
// Ensure in_min_data quantizes to at least 1, i.e. strictly above quantized_min.
if (non_zero) {
out_min = std::min(out_min,
in_min_data - (out_max - in_min_data)
/ (quantized_max - quantized_min - 1));
}
*scale = (out_max - out_min) / (quantized_max - quantized_min);
const float kEps = 1e-6;
if (out_min < -kEps && out_max > kEps) {
float quantized_zero = -out_min / *scale;
int32_t
quantized_zero_near_int = static_cast<int32_t>(roundf(quantized_zero));
*zero_point = quantized_zero_near_int;
if (fabs(quantized_zero - quantized_zero_near_int) > kEps) {
if (quantized_zero < quantized_zero_near_int || non_zero) {
// keep out_max fixed, and move out_min
*zero_point = static_cast<int32_t>(std::ceil(quantized_zero));
*scale = out_max / (quantized_max - *zero_point);
} else {
// keep out_min fixed, and move out_max
*scale = out_min / (quantized_min - *zero_point);
}
}
} else if (out_min > -kEps) {
*zero_point = quantized_min;
} else {
*zero_point = quantized_max;
}
}
template<typename T>
inline T Saturate(float value) {
int rounded_value = static_cast<int>(value);
if (rounded_value <= std::numeric_limits<T>::lowest()) {
return std::numeric_limits<T>::lowest();
} else if (rounded_value >= std::numeric_limits<T>::max()) {
return std::numeric_limits<T>::max();
} else {
return static_cast<T>(rounded_value);
}
}
inline void FindMinMax(const float *input,
const index_t size,
float *min_val, float *max_val) {
float max_v = std::numeric_limits<float>::lowest();
float min_v = std::numeric_limits<float>::max();
for (index_t i = 0; i < size; ++i) {
max_v = std::max(max_v, input[i]);
min_v = std::min(min_v, input[i]);
}
*min_val = min_v;
*max_val = max_v;
}
template<typename T>
inline void QuantizeWithScaleAndZeropoint(const float *input,
const index_t size,
float scale,
int32_t zero_point,
T *output) {
float recip_scale = 1 / scale;
#pragma omp parallel for
for (int i = 0; i < size; ++i) {
output[i] = Saturate<T>(roundf(zero_point + recip_scale * input[i]));
}
}
template<typename T>
inline void Quantize(const float *input,
const index_t size,
bool non_zero,
T *output,
float *scale,
int32_t *zero_point) {
float in_min_data;
float in_max_data;
FindMinMax(input, size, &in_min_data, &in_max_data);
AdjustRange<T>(in_min_data, in_max_data, non_zero,
scale, zero_point);
QuantizeWithScaleAndZeropoint(input, size, *scale, *zero_point, output);
}
template<typename T>
inline void Dequantize(const T *input,
const index_t size,
const float scale,
const int32_t zero_point,
float *output) {
#pragma omp parallel for
for (int i = 0; i < size; ++i) {
output[i] = scale * (input[i] - zero_point);
}
}
inline void QuantizeMultiplier(double multiplier,
int32_t* output_multiplier,
int32_t* shift) {
if (multiplier == 0.f) {
*output_multiplier = 0;
*shift = 0;
return;
}
const double q = std::frexp(multiplier, shift);
auto qint = static_cast<int64_t>(roundl(q * (1ll << 31)));
if (qint == (1ll << 31)) {
qint /= 2;
++*shift;
}
*output_multiplier = static_cast<int32_t>(qint);
MACE_CHECK(*output_multiplier <= std::numeric_limits<int32_t>::max());
}
inline void GetOutputMultiplierAndShift(
const float lhs_scale, const float rhs_scale, const float output_scale,
int32_t *quantized_multiplier, int *right_shift) {
float real_multiplier = lhs_scale * rhs_scale / output_scale;
MACE_CHECK(real_multiplier > 0.f && real_multiplier < 1.f, real_multiplier);
int exponent;
QuantizeMultiplier(real_multiplier, quantized_multiplier, &exponent);
*right_shift = -exponent;
MACE_CHECK(*right_shift >= 0);
}
template<DeviceType D, typename T>
struct QuantizeFunctor;
......
......@@ -1174,7 +1174,7 @@ void TestQuant(const index_t batch,
Tensor *bias = net.GetTensor("Bias");
auto bias_data = bias->data<float>();
std::vector<int32_t> q_bias(bias->size());
kernels::QuantizeWithScaleAndZeropoint(
QuantizeWithScaleAndZeropoint(
bias_data, bias->size(), q_input->scale() * q_filter->scale(), 0,
q_bias.data());
net.AddInputFromArray<DeviceType::CPU, int32_t>("QuantizedBias",
......
......@@ -459,7 +459,7 @@ void TestQuant(const index_t batch,
Tensor *bias = net.GetTensor("Bias");
auto bias_data = bias->data<float>();
std::vector<int32_t> q_bias(bias->size());
kernels::QuantizeWithScaleAndZeropoint(
QuantizeWithScaleAndZeropoint(
bias_data, bias->size(), q_input->scale() * q_filter->scale(), 0,
q_bias.data());
net.AddInputFromArray<DeviceType::CPU, int32_t>(
......
......@@ -277,7 +277,7 @@ void QuantRandom(const index_t batch,
Tensor *bias = net.GetTensor("Bias");
auto bias_data = bias->data<float>();
std::vector<int32_t> q_bias(bias->size());
kernels::QuantizeWithScaleAndZeropoint(
QuantizeWithScaleAndZeropoint(
bias_data, bias->size(), q_input->scale() * q_weight->scale(), 0,
q_bias.data());
net.AddInputFromArray<DeviceType::CPU, int32_t>("QuantizedBias",
......
......@@ -36,6 +36,7 @@ message ConstTensor {
optional int64 data_size = 7;
optional float scale = 8;
optional int32 zero_point = 9;
optional bool quantized = 10 [default = false];
optional uint32 node_id = 100;
}
......
......@@ -1667,9 +1667,6 @@ class Transformer(base_converter.ConverterInterface):
def quantize_tensor(self, tensor):
"""Assume biasadd has been already folded with convolution and fc"""
if not self._option.quantize:
return False
if tensor.data_type == mace_pb2.DT_FLOAT:
ops = self._consumers.get(tensor.name, None)
if len(ops) == 1 and ops[0].type in [MaceOp.Conv2D.name,
......@@ -1697,6 +1694,7 @@ class Transformer(base_converter.ConverterInterface):
tensor.int32_data.extend(quantized_tensor.data)
tensor.scale = quantized_tensor.scale
tensor.zero_point = quantized_tensor.zero
tensor.quantized = True
self._quantized_tensor.update([tensor.name])
return False
......
......@@ -34,6 +34,7 @@ void CreateTensor{{tensor_info.id}}(mace::ConstTensor *const_tensor) {
const_tensor->set_node_id({{ tensor.node_id }});
const_tensor->set_scale({{ tensor.scale }});
const_tensor->set_zero_point({{ tensor.zero_point }});
const_tensor->set_quantized({{ tensor.quantized | lower}});
}
} // namespace {{tag}}
......
......@@ -15,20 +15,17 @@ cc_library(
"logging.cc",
"string_util.cc",
],
hdrs = [
"env_time.h",
"logging.h",
"memory_logging.h",
"rwlock.h",
"string_util.h",
"timer.h",
"tuner.h",
"utils.h",
hdrs = glob([
"*.h",
]),
copts = [
"-Werror",
"-Wextra",
"-Wno-missing-field-initializers",
],
linkopts = if_android([
"-llog",
]),
copts = ["-Werror", "-Wextra", "-Wno-missing-field-initializers"],
deps = [
"//mace/public",
],
......@@ -40,7 +37,11 @@ cc_test(
srcs = [
"tuner_test.cc",
],
copts = ["-Werror", "-Wextra", "-Wno-missing-field-initializers"],
copts = [
"-Werror",
"-Wextra",
"-Wno-missing-field-initializers",
],
linkopts = ["-ldl"] + if_android([
"-pie",
"-lm", # Required by unordered_map
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_UTILS_QUANTIZE_H_
#define MACE_UTILS_QUANTIZE_H_
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <limits>
namespace mace {
template<typename T>
inline void AdjustRange(const float in_min_data,
const float in_max_data,
const bool non_zero,
float *scale,
int32_t *zero_point) {
// Re-range so that the float range contains zero and
// zero maps exactly onto an integer (u8) value.
const T quantized_min = std::numeric_limits<T>::lowest();
const T quantized_max = std::numeric_limits<T>::max();
if (quantized_min < 0) {
MACE_ASSERT(!non_zero, "Cannot nudge to non_zero quantize value.");
}
float out_max = std::max(0.f, in_max_data);
float out_min = std::min(0.f, in_min_data);
// Ensure in_min_data quantizes to at least 1, i.e. strictly above quantized_min.
if (non_zero) {
out_min = std::min(out_min,
in_min_data - (out_max - in_min_data)
/ (quantized_max - quantized_min - 1));
}
*scale = (out_max - out_min) / (quantized_max - quantized_min);
const float kEps = 1e-6;
if (out_min < -kEps && out_max > kEps) {
float quantized_zero = -out_min / *scale;
int32_t
quantized_zero_near_int = static_cast<int32_t>(roundf(quantized_zero));
*zero_point = quantized_zero_near_int;
if (fabs(quantized_zero - quantized_zero_near_int) > kEps) {
if (quantized_zero < quantized_zero_near_int || non_zero) {
// keep out_max fixed, and move out_min
*zero_point = static_cast<int32_t>(std::ceil(quantized_zero));
*scale = out_max / (quantized_max - *zero_point);
} else {
// keep out_min fixed, and move out_max
*scale = out_min / (quantized_min - *zero_point);
}
}
} else if (out_min > -kEps) {
*zero_point = quantized_min;
} else {
*zero_point = quantized_max;
}
}
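// Worked example (illustrative numbers): for a uint8 target and a float range
// of [-1.0, 3.0] with non_zero == false, the first pass gives
// scale = 4 / 255 (about 0.0157) and quantized_zero = 63.75. Since that is not
// an integer, out_max is kept fixed and the range is nudged:
// zero_point = ceil(63.75) = 64 and scale = 3.0 / (255 - 64), so the real
// value 0.0 now maps exactly onto the integer 64.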
template<typename T>
inline T Saturate(float value) {
int rounded_value = static_cast<int>(value);
if (rounded_value <= std::numeric_limits<T>::lowest()) {
return std::numeric_limits<T>::lowest();
} else if (rounded_value >= std::numeric_limits<T>::max()) {
return std::numeric_limits<T>::max();
} else {
return static_cast<T>(rounded_value);
}
}
inline void FindMinMax(const float *input,
const index_t size,
float *min_val, float *max_val) {
float max_v = std::numeric_limits<float>::lowest();
float min_v = std::numeric_limits<float>::max();
for (index_t i = 0; i < size; ++i) {
max_v = std::max(max_v, input[i]);
min_v = std::min(min_v, input[i]);
}
*min_val = min_v;
*max_val = max_v;
}
template<typename T>
inline void QuantizeWithScaleAndZeropoint(const float *input,
const index_t size,
float scale,
int32_t zero_point,
T *output) {
float recip_scale = 1 / scale;
#pragma omp parallel for
for (int i = 0; i < size; ++i) {
output[i] = Saturate<T>(roundf(zero_point + recip_scale * input[i]));
}
}
template<typename T>
inline void Quantize(const float *input,
const index_t size,
bool non_zero,
T *output,
float *scale,
int32_t *zero_point) {
float in_min_data;
float in_max_data;
FindMinMax(input, size, &in_min_data, &in_max_data);
AdjustRange<T>(in_min_data, in_max_data, non_zero,
scale, zero_point);
QuantizeWithScaleAndZeropoint(input, size, *scale, *zero_point, output);
}
template<typename T>
inline void Dequantize(const T *input,
const index_t size,
const float scale,
const int32_t zero_point,
float *output) {
#pragma omp parallel for
for (int i = 0; i < size; ++i) {
output[i] = scale * (input[i] - zero_point);
}
}
inline void QuantizeMultiplier(double multiplier,
int32_t* output_multiplier,
int32_t* shift) {
if (multiplier == 0.f) {
*output_multiplier = 0;
*shift = 0;
return;
}
const double q = std::frexp(multiplier, shift);
auto qint = static_cast<int64_t>(roundl(q * (1ll << 31)));
if (qint == (1ll << 31)) {
qint /= 2;
++*shift;
}
*output_multiplier = static_cast<int32_t>(qint);
MACE_CHECK(*output_multiplier <= std::numeric_limits<int32_t>::max());
}
inline void GetOutputMultiplierAndShift(
const float lhs_scale, const float rhs_scale, const float output_scale,
int32_t *quantized_multiplier, int *right_shift) {
float real_multiplier = lhs_scale * rhs_scale / output_scale;
MACE_CHECK(real_multiplier > 0.f && real_multiplier < 1.f, real_multiplier);
int exponent;
QuantizeMultiplier(real_multiplier, quantized_multiplier, &exponent);
*right_shift = -exponent;
MACE_CHECK(*right_shift >= 0);
}
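// The (quantized_multiplier, right_shift) pair approximates the real
// multiplier as quantized_multiplier * 2^-31 * 2^-right_shift. For example
// (illustrative), real_multiplier = 0.3 splits into 0.6 * 2^-1, giving
// quantized_multiplier = round(0.6 * 2^31) and right_shift = 1; gemmlowp's
// fixed-point output stage then rescales an int32 accumulator acc roughly as
// ((int64_t(acc) * quantized_multiplier) >> 31) >> right_shift, with rounding.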
} // namespace mace
#endif // MACE_UTILS_QUANTIZE_H_
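A minimal usage sketch for the helpers in this header, assuming the project's usual headers providing index_t and the MACE_* macros are on the include path (values are illustrative):

#include <cstdint>
#include <vector>

#include "mace/utils/quantize.h"

// Quantize a small float buffer to uint8, then recover an approximation.
void RoundTripExample() {
  std::vector<float> weights = {-1.0f, 0.0f, 0.5f, 3.0f};
  std::vector<uint8_t> quantized(weights.size());
  std::vector<float> recovered(weights.size());
  float scale = 0.0f;
  int32_t zero_point = 0;
  mace::Quantize(weights.data(), weights.size(), /*non_zero=*/false,
                 quantized.data(), &scale, &zero_point);
  mace::Dequantize(quantized.data(), quantized.size(), scale, zero_point,
                   recovered.data());
  // Each recovered[i] differs from weights[i] by at most about one scale step.
}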