Commit a157a65f authored by 李寅

Merge branch 'compress' into 'master'

Add model compression

See merge request !1088
@@ -504,3 +504,25 @@ which will reduce the library size significantly. the final binary just link the
    }
}  // namespace mace

Reduce Model Size
-------------------
Model file size can be a bottleneck when deploying neural networks on mobile devices,
so MACE provides several ways to reduce the model size with little or no loss of performance or accuracy.

**1. Save model weights in half-precision floating point format**
The default data type of a model is 32-bit float. To shrink the model, the weights can instead be
stored in 16-bit half precision, halving the weights size with negligible accuracy degradation.

For CPU, ``data_type`` can be set to ``fp16_fp32`` in the deployment file so that the weights are
stored in half precision while inference still runs in float.

For GPU, ``fp16_fp32`` is the default: GPU ops take half-precision inputs and outputs, while kernel
computation is carried out in float.
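
For illustration, the relevant part of a deployment file might look like the following (a minimal
sketch: the model name is hypothetical and all other required deployment fields are omitted)::

    models:
      my_model:               # hypothetical model name
        runtime: cpu+gpu
        data_type: fp16_fp32  # store weights in half, run inference in float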
**2. Save model weights in quantized fixed point format**
The weights of convolutional (excluding depthwise) and fully-connected layers account for most of a
model's size. These weights can be quantized to 8-bit to cut their size to a quarter, while accuracy
usually drops by only 1%-3%. For example, the top-1 accuracy of MobileNetV1 with quantized weights
is 68.2% on the ImageNet validation set.

``quantize_large_weights`` can be set to 1 in the deployment file to store these weights in 8-bit
while inference still runs in float. This option works for both CPU and GPU.
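
A corresponding deployment-file sketch (again with a hypothetical model name and other required
fields omitted; as the converter later checks, ``quantize`` and ``quantize_large_weights`` must not
both be set to 1)::

    models:
      my_model:                    # hypothetical model name
        runtime: cpu               # GPU is supported as well
        quantize_large_weights: 1  # store Conv2D/FC weights in 8-bit, run inference in float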
@@ -104,9 +104,9 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
   if (model_data_size > 0) {
     bool is_quantize_model = IsQuantizedModel(net_def);
-    diffused_buffer_ = (device_type == DeviceType::CPU &&
-                        (HasHalfTensor(net_def) ||
-                         (!is_quantize_model && HasQuantizedTensor(net_def))));
+    diffused_buffer_ =
+        (device_type == DeviceType::CPU && HasHalfTensor(net_def)) ||
+        (!is_quantize_model && HasQuantizedTensor(net_def));
 #ifdef MACE_ENABLE_OPENCL
     diffused_buffer_ = diffused_buffer_ || (device_type == DeviceType::GPU &&
         device->gpu_runtime()->opencl_runtime()->GetDeviceMaxMemAllocSize() <=
@@ -125,8 +125,9 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
     }
     DataType dst_data_type = const_tensor.data_type();
-    if (device_type == DeviceType::CPU &&
-        const_tensor.data_type() == DataType::DT_HALF) {
+    if ((device_type == DeviceType::CPU &&
+         const_tensor.data_type() == DataType::DT_HALF) ||
+        (!is_quantize_model && const_tensor.quantized())) {
       dst_data_type = DataType::DT_FLOAT;
     }
@@ -147,8 +148,8 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
                  ") should <= ",
                  model_data_size);
-      if (device_type == DeviceType::CPU) {
-        if (const_tensor.data_type() == DataType::DT_HALF) {
+      if (device_type == DeviceType::CPU &&
+          const_tensor.data_type() == DataType::DT_HALF) {
         // uncompress the weights of fp16
         auto org_data = reinterpret_cast<const half *>(
             model_data + const_tensor.offset());
@@ -156,25 +157,19 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
         for (int i = 0; i < const_tensor.data_size(); ++i) {
           dst_data[i] = half_float::half_cast<float>(org_data[i]);
         }
       } else if (!is_quantize_model && const_tensor.quantized()) {
         // uncompress the weights of uint8
-        std::unique_ptr<Tensor> dequantized_tensor(new Tensor(true));
-        dequantized_tensor->Resize(dims);
-        auto quantized_data = reinterpret_cast<const uint8_t *>(
-            model_data + const_tensor.offset());
-        auto dequantized_data = tensor->mutable_data<float>();
-        QuantizeUtil<uint8_t>
-            quantize_util(&device->cpu_runtime()->thread_pool());
-        quantize_util.Dequantize(quantized_data,
-                                 tensor->size(),
-                                 const_tensor.scale(),
-                                 const_tensor.zero_point(),
-                                 dequantized_data);
-      } else {
-        tensor->CopyBytes(model_data + const_tensor.offset(),
-                          const_tensor.data_size() *
-                              GetEnumTypeSize(const_tensor.data_type()));
-      }
+        Tensor::MappingGuard guard(tensor.get());
+        auto quantized_data = reinterpret_cast<const uint8_t *>(
+            model_data + const_tensor.offset());
+        auto dequantized_data = tensor->mutable_data<float>();
+        QuantizeUtil<uint8_t>
+            quantize_util(&device->cpu_runtime()->thread_pool());
+        quantize_util.Dequantize(quantized_data,
+                                 tensor->size(),
+                                 const_tensor.scale(),
+                                 const_tensor.zero_point(),
+                                 dequantized_data);
       } else {
         tensor->CopyBytes(model_data + const_tensor.offset(),
                           const_tensor.data_size() *
...
@@ -142,6 +142,7 @@ def main(unused_args):
     option.transformer_option = FLAGS.graph_optimize_options.split(',')
     option.winograd = FLAGS.winograd
     option.quantize = FLAGS.quantize
+    option.quantize_large_weights = FLAGS.quantize_large_weights
     option.quantize_range_file = FLAGS.quantize_range_file
     option.change_concat_ranges = FLAGS.change_concat_ranges
     option.cl_mem_type = FLAGS.cl_mem_type
@@ -389,6 +390,13 @@ def parse_args():
         const=False,
         default=False,
         help="quantize model")
+    parser.add_argument(
+        "--quantize_large_weights",
+        type=str2bool,
+        nargs='?',
+        const=False,
+        default=False,
+        help="quantize large weights for compression")
     parser.add_argument(
         "--quantize_range_file",
         type=str,
...
@@ -320,6 +320,7 @@ class TransformerRule(Enum):
     QUANTIZE_SPECIFIC_OPS_ONLY = 40
     FP16_MATMUL_WEIGHT = 41
     FP16_GATHER_WEIGHT = 42
+    QUANTIZE_LARGE_WEIGHTS = 43


 class ConverterInterface(object):
@@ -394,6 +395,7 @@ class ConverterOption(object):
         self._device = DeviceType.CPU.value
         self._winograd = 0
         self._quantize = False
+        self._quantize_large_weights = False
         self._quantize_range_file = ""
         self._change_concat_ranges = False
         self._transformer_option = None
@@ -427,6 +429,10 @@ class ConverterOption(object):
     def quantize(self):
         return self._quantize

+    @property
+    def quantize_large_weights(self):
+        return self._quantize_large_weights
+
     @property
     def change_concat_ranges(self):
         return self._change_concat_ranges
@@ -483,6 +489,10 @@ class ConverterOption(object):
     def quantize(self, quantize):
         self._quantize = quantize

+    @quantize_large_weights.setter
+    def quantize_large_weights(self, quantize_large_weights):
+        self._quantize_large_weights = quantize_large_weights
+
     @quantize_range_file.setter
     def quantize_range_file(self, quantize_range_file):
         self._quantize_range_file = quantize_range_file
@@ -558,6 +568,10 @@ class ConverterOption(object):
                 # Need to be put after SORT_BY_EXECUTION
                 TransformerRule.ADD_QUANTIZE_TENSOR_RANGE,
             ]
+        if self.quantize_large_weights:
+            self._transformer_option = self._transformer_option + [
+                TransformerRule.QUANTIZE_LARGE_WEIGHTS
+            ]
         if self._quantize:
             self._transformer_option = self._transformer_option + [
                 # need to be put after ADD_QUANTIZE_TENSOR_RANGE
...
@@ -110,6 +110,8 @@ class Transformer(base_converter.ConverterInterface):
                 self.fp16_matmul_weight,
             TransformerRule.FP16_GATHER_WEIGHT:
                 self.fp16_gather_weight,
+            TransformerRule.QUANTIZE_LARGE_WEIGHTS:
+                self.quantize_large_weights,
         }

         self._option = option
@@ -1625,6 +1627,35 @@ class Transformer(base_converter.ConverterInterface):

         return False

+    def quantize_large_tensor(self, tensor):
+        if tensor.data_type == mace_pb2.DT_FLOAT:
+            ops = self._consumers.get(tensor.name, None)
+            if ops is not None and len(ops) == 1:
+                if ops[0].type in [MaceOp.Conv2D.name,
+                                   MaceOp.FullyConnected.name]:
+                    quantized_tensor = \
+                        quantize_util.quantize(tensor.float_data,
+                                               self._option.device,
+                                               False)
+                    tensor.data_type = mace_pb2.DT_UINT8
+
+                    del tensor.float_data[:]
+                    tensor.int32_data.extend(quantized_tensor.data)
+                    tensor.scale = quantized_tensor.scale
+                    tensor.zero_point = quantized_tensor.zero
+                    tensor.minval = quantized_tensor.minval
+                    tensor.maxval = quantized_tensor.maxval
+                    tensor.quantized = True
+                    self._quantized_tensor.update([tensor.name])
+
+    def quantize_large_weights(self):
+        print("Quantize large weights")
+        net = self._model
+        for tensor in net.tensors:
+            self.quantize_large_tensor(tensor)
+
+        return False
+
     def add_quantize_info(self, op, minval, maxval):
         scale, zero, minval, maxval = \
             quantize_util.adjust_range(minval, maxval, self._option.device,
...
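
To make the transformation above easier to follow: quantize_large_weights stores eligible Conv2D
and FullyConnected weights as uint8 together with a per-tensor scale and zero_point at conversion
time, and the runtime dequantizes them back to float when the model is loaded (see the
Workspace::LoadModelTensor hunk earlier). The snippet below is a self-contained NumPy sketch of
that affine quantize/dequantize round trip; it is only an illustration, not MACE's actual
quantize_util implementation.

    import numpy as np

    def quantize_uint8(weights):
        """Affine per-tensor quantization of float weights to uint8."""
        minval = min(float(weights.min()), 0.0)   # keep 0.0 exactly representable
        maxval = max(float(weights.max()), 0.0)
        scale = (maxval - minval) / 255.0 if maxval > minval else 1.0
        zero_point = int(round(-minval / scale))
        q = np.clip(np.round(weights / scale) + zero_point, 0, 255).astype(np.uint8)
        return q, scale, zero_point

    def dequantize_uint8(q, scale, zero_point):
        """Recover approximate float weights from the stored uint8 data."""
        return scale * (q.astype(np.float32) - zero_point)

    w = np.random.randn(64, 32).astype(np.float32)   # stand-in for a weight tensor
    q, scale, zp = quantize_uint8(w)
    w_restored = dequantize_uint8(q, scale, zp)
    print(w.nbytes, "->", q.nbytes)                  # 4x smaller storage
    print(float(np.abs(w - w_restored).max()))       # small reconstruction error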
@@ -132,6 +132,9 @@ class DeviceType(object):
     HTA = 'HTA'
     APU = 'APU'
+    # for validation threshold
+    QUANTIZE = 'QUANTIZE'


 class DataFormat(object):
     NONE = "NONE"
@@ -408,6 +411,7 @@ class YAMLKeyword(object):
     obfuscate = 'obfuscate'
     winograd = 'winograd'
     quantize = 'quantize'
+    quantize_large_weights = 'quantize_large_weights'
     quantize_range_file = 'quantize_range_file'
     change_concat_ranges = 'change_concat_ranges'
     validation_inputs_data = 'validation_inputs_data'
...
@@ -118,8 +118,7 @@ class DefaultValues(object):

 class ValidationThreshold(object):
     cpu_threshold = 0.999,
     gpu_threshold = 0.995,
-    hexagon_threshold = 0.930,
-    cpu_quantize_threshold = 0.980,
+    quantize_threshold = 0.980,


 CPP_KEYWORDS = [
@@ -501,12 +500,9 @@ def format_model_config(flags):
     threshold_dict = {
         DeviceType.CPU: ValidationThreshold.cpu_threshold,
         DeviceType.GPU: ValidationThreshold.gpu_threshold,
-        DeviceType.HEXAGON + "_QUANTIZE":
-            ValidationThreshold.hexagon_threshold,
-        DeviceType.HTA + "_QUANTIZE":
-            ValidationThreshold.hexagon_threshold,
-        DeviceType.CPU + "_QUANTIZE":
-            ValidationThreshold.cpu_quantize_threshold,
+        DeviceType.HEXAGON: ValidationThreshold.quantize_threshold,
+        DeviceType.HTA: ValidationThreshold.quantize_threshold,
+        DeviceType.QUANTIZE: ValidationThreshold.quantize_threshold,
     }
     for k, v in six.iteritems(validation_threshold):
         if k.upper() == 'DSP':
@@ -515,7 +511,7 @@ def format_model_config(flags):
                            DeviceType.GPU,
                            DeviceType.HEXAGON,
                            DeviceType.HTA,
-                           DeviceType.CPU + "_QUANTIZE"):
+                           DeviceType.QUANTIZE):
             raise argparse.ArgumentTypeError(
                 'Unsupported validation threshold runtime: %s' % k)
         threshold_dict[k.upper()] = v
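
With this change, quantized CPU/GPU runs, HEXAGON and HTA all validate against a single quantize
threshold (0.980 by default). Assuming the usual ``validation_threshold`` section of the deployment
file (the exact schema is not shown in this diff; key names are uppercased before the lookup above
and the surrounding model entry is omitted), a user override would look roughly like::

    validation_threshold:
      cpu: 0.999
      gpu: 0.995
      quantize: 0.980   # used for quantized CPU/GPU, HEXAGON and HTA validation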
@@ -566,11 +562,18 @@ def format_model_config(flags):
                     YAMLKeyword.obfuscate,
                     YAMLKeyword.winograd,
                     YAMLKeyword.quantize,
+                    YAMLKeyword.quantize_large_weights,
                     YAMLKeyword.change_concat_ranges]:
             value = model_config.get(key, "")
             if value == "":
                 model_config[key] = 0

+        mace_check(model_config[YAMLKeyword.quantize] == 0 or
+                   model_config[YAMLKeyword.quantize_large_weights] == 0,
+                   ModuleName.YAML_CONFIG,
+                   "quantize and quantize_large_weights should not be set to 1"
+                   " at the same time.")
+
         mace_check(model_config[YAMLKeyword.winograd] in WinogradParameters,
                    ModuleName.YAML_CONFIG,
                    "'winograd' parameters must be in "
@@ -773,6 +776,7 @@ def convert_model(configs, cl_mem_type):
             embed_model_data,
             model_config[YAMLKeyword.winograd],
             model_config[YAMLKeyword.quantize],
+            model_config[YAMLKeyword.quantize_large_weights],
             quantize_range_file_path,
             model_config[YAMLKeyword.change_concat_ranges],
             model_config[YAMLKeyword.obfuscate],
...
@@ -730,8 +730,11 @@ class DeviceWrapper:
                     model_config[
                         YAMLKeyword.weight_sha256_checksum])
                 validate_type = device_type
-                if model_config[YAMLKeyword.quantize] == 1:
-                    validate_type = device_type + '_QUANTIZE'
+                if device_type in [DeviceType.CPU,
+                                   DeviceType.GPU] and \
+                        (model_config[YAMLKeyword.quantize] == 1 or
+                         model_config[YAMLKeyword.quantize_large_weights] == 1):  # noqa
+                    validate_type = DeviceType.QUANTIZE

                 dockerfile_path, docker_image_tag = \
                     get_dockerfile_info(
...
@@ -499,6 +499,7 @@ def gen_model_code(model_codegen_dir,
                    embed_model_data,
                    winograd,
                    quantize,
+                   quantize_large_weights,
                    quantize_range_file,
                    change_concat_ranges,
                    obfuscate,
@@ -537,6 +538,7 @@ def gen_model_code(model_codegen_dir,
             "--embed_model_data=%s" % embed_model_data,
             "--winograd=%s" % winograd,
             "--quantize=%s" % quantize,
+            "--quantize_large_weights=%s" % quantize_large_weights,
             "--quantize_range_file=%s" % quantize_range_file,
             "--change_concat_ranges=%s" % change_concat_ranges,
             "--obfuscate=%s" % obfuscate,
...