# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import platform

from ..common import get_logger
from .utils.predict import predict_compressed_model, with_variable_shape
from .strategy_config import *
from paddleslim.analysis import TableLatencyPredictor

_logger = get_logger(__name__, level=logging.INFO)

__all__ = [
    "prepare_strategy", "create_strategy_config", "get_final_quant_config"
]

# config tester to test the loss of quant_post
hpo_config_tester = {
    "ptq_algo": ["avg", "mse", "KL"],
    "weight_quantize_type": ['channel_wise_abs_max', 'abs_max'],
    "bias_correct": [False],
    "batch_num": [5],
    "max_quant_count": 1,
}

# default hpo config
default_hpo_config = {
    "ptq_algo": ["KL", "hist", "avg", "mse"],
    "weight_quantize_type": ['channel_wise_abs_max', 'abs_max'],
    "bias_correct": [True, False],
    "hist_percent": [0.98, 0.999],
    "batch_num": [10, 30],
    "max_quant_count": 20,
}

# default quant config, can be used by ptq&hpo and qat&distillation
default_quant_config = {
    'quantize_op_types':
    ['conv2d', 'depthwise_conv2d', 'mul', 'matmul', 'matmul_v2'],
    'weight_bits': 8,
    'activation_bits': 8,
    "is_full_quantize": False,
    "activation_quantize_type": 'moving_average_abs_max',
    "weight_quantize_type": 'channel_wise_abs_max',
    "not_quant_pattern": ["skip_quant"],
}

# default train config
DefaultTrainConfig = {
    "epochs": 1,
    "eval_iter": 500,
    "learning_rate": 0.0001,
    "optimizer_builder": {
        "optimizer": {
            "type": "Momentum",
        },
        "weight_decay": 4.0e-05,
    },
}

# strategies that are expected to be (nearly) lossless in accuracy
EXPERIENCE_STRATEGY_WITHOUT_LOSS = [
    'sparse_0.75_fp32', 'prune_0.3_fp32', 'origin_int8', 'sparse_0.75_int8',
    'prune_0.3_int8'
]
MAGIC_SPARSE_RATIO = 0.75
### TODO: the 0.02 threshold may not be suitable, need to check.
### NOTE: the magic thresholds were reduced so that quantization-aware training is chosen more often.
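### The MAGIC_*_EMD_DISTANCE thresholds below bound the EMD loss between the
### fp32 model outputs and the post-training-quantized model outputs;
### get_final_quant_config uses them to pick between quant_post + HPO and
### quant-aware training + distillation.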
MAGIC_MAX_EMD_DISTANCE = 0.00002  # 0.02
MAGIC_MIN_EMD_DISTANCE = 0.00001  # 0.01

DEFAULT_TRANSFORMER_STRATEGY = 'prune_0.25_int8'
DEFAULT_STRATEGY = 'origin_int8'
DEFAULT_QUANT_SPEEDUP = 0.7


def create_strategy_config(strategy_str, model_type):
    """ Create the compression config according to the strategy string. """
    tmp_s = strategy_str.split('_')
    configs = []

    dis_config = Distillation()
    if len(tmp_s) == 3:
        ### TODO(ceci3): choose prune algo automatically
        if 'prune' in tmp_s[0]:
            ### default structured prune config
            default_prune_config = {
                'pruned_ratio': float(tmp_s[1]),
                'criterion': 'l1_norm'
            }
            if model_type == 'transformer':
                ### transformer models use a dedicated pruner
                tmp_s[0] = tmp_s[0].replace('prune', 'TransformerPrune')
                default_prune_config = {'pruned_ratio': float(tmp_s[1])}
            else:
                tmp_s[0] = tmp_s[0].replace('prune', 'Prune')
        else:
            ### default unstructured prune config
            default_prune_config = {
                'prune_strategy':
                'gmp',  ### default unstructured prune strategy is gmp
                'prune_mode': 'ratio',
                'ratio': float(tmp_s[1]),
                'local_sparsity': True,
                'prune_params_type': 'conv1x1_only'
            }
        tmp_s[0] = tmp_s[0].replace('sparse', 'UnstructurePrune')
        prune_config = eval(tmp_s[0])(**default_prune_config)
        configs.append({tmp_s[0]: prune_config, 'Distillation': dis_config})

    ### TODO(ceci3): support skipping some layers and full quantization
    if tmp_s[-1] == 'int8':
        ### only Linux can use SMAC to do hyperparameter optimization;
        ### choose quant_aware to do quantization on other platforms
        if platform.system().lower() == 'linux':
            quant_config = Quantization(**default_quant_config)
            hpo_config = HyperParameterOptimization(**hpo_config_tester)
            configs.append({
                'Quantization': quant_config,
                'HyperParameterOptimization': hpo_config
            })
        else:
            quant_config = Quantization(**default_quant_config)
            dis_config = Distillation()
            configs.append({
                'Quantization': quant_config,
                'Distillation': dis_config
            })

    return configs


def create_train_config(strategy_str, model_type):
    ### TODO: support more strategies and model types
    train_config = TrainConfig(**DefaultTrainConfig)
    return train_config


def prepare_strategy(executor,
                     places,
                     model_dir,
                     model_filename,
                     params_filename,
                     target_speedup=None,
                     deploy_hardware=None,
                     model_type=None):
    """ Prepare the compression config automatically. """
    final_strategy = None

    ### use the hardware latency table if it is supported
    if not with_variable_shape(
            model_dir,
            model_filename=model_filename,
            params_filename=params_filename) and (
                deploy_hardware in TableLatencyPredictor.hardware_list):
        compressed_time_dict = predict_compressed_model(
            executor,
            places,
            model_dir,
            model_filename,
            params_filename,
            hardware=deploy_hardware)

        baseline = compressed_time_dict['origin_fp32']
        speedup_ratio = {}
        for strategy, latency in compressed_time_dict.items():
            speedup_ratio[strategy] = 1.0 - float(latency) / baseline
        sorted_speedup_ratio = sorted(speedup_ratio.items(), key=lambda x: x[1])

        ### if target speedup is None, choose the strategy by experience.
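        ### otherwise, collect the strategies whose predicted speedup lies
        ### within 0.1 of the target and pick the best of them below.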
        if target_speedup is None:
            max_speedup = -1.0
            for s in EXPERIENCE_STRATEGY_WITHOUT_LOSS:
                if s not in speedup_ratio:
                    _logger.info(f"cannot get the speedup of strategy {s}")
                    continue
                if speedup_ratio[s] > max_speedup:
                    max_speedup = speedup_ratio[s]
                    final_strategy = s
        else:
            candidate_s = []
            pre_s = None
            for strategy, ratio in sorted_speedup_ratio:
                if abs(ratio - target_speedup) <= 0.1:
                    candidate_s.append(strategy)
                ### if no strategy satisfies the target speedup, choose the
                ### two strategies whose speedups are closest to it
                if ratio > target_speedup and len(candidate_s) == 0:
                    if pre_s is not None:
                        candidate_s.append(pre_s)
                    candidate_s.append(strategy)
                pre_s = strategy

            if 'origin_int8' in candidate_s:
                ### quantization alone already reaches the target speedup
                final_strategy = 'origin_int8'
            else:
                candidate_s = sorted(candidate_s, key=lambda x: x.split('_')[1])
                for c in candidate_s:
                    if c.startswith('sparse') and float(
                            c.split('_')[1]) <= MAGIC_SPARSE_RATIO:
                        final_strategy = c
                if final_strategy is None and len(candidate_s) > 0:
                    final_strategy = candidate_s[0]
    else:
        ### the default speedup of quantization is 70% compared to fp32
        ### TODO(ceci3): support full quantization or skipping some layers later
        if target_speedup is None:
            if model_type == 'transformer':
                final_strategy = DEFAULT_TRANSFORMER_STRATEGY
            else:
                final_strategy = DEFAULT_STRATEGY
        elif target_speedup > DEFAULT_QUANT_SPEEDUP:
            prune_ratio = target_speedup - DEFAULT_QUANT_SPEEDUP
            if prune_ratio > 1.0:
                raise NotImplementedError(
                    "target_speedup {} is improper".format(target_speedup))
            final_strategy = 'prune_{}_int8'.format(str(prune_ratio))
        else:
            raise NotImplementedError(
                "target_speedup {} is improper".format(target_speedup))

    strategy_config = create_strategy_config(final_strategy, model_type)
    return strategy_config


def get_final_quant_config(ptq_loss, model_type=None):
    """ Transform the quantization tester config into the real quantization config. """
    ### if the emd loss is less than MAGIC_MIN_EMD_DISTANCE, PTQ is good
    ### enough, so finish the compression.
    if ptq_loss < MAGIC_MIN_EMD_DISTANCE:
        return None
    ### if the emd loss is less than MAGIC_MAX_EMD_DISTANCE, select quant_post & hpo.
    elif ptq_loss < MAGIC_MAX_EMD_DISTANCE:
        quant_config = Quantization(**default_quant_config)
        hpo_config = HyperParameterOptimization(**default_hpo_config)
        configs = [{
            'Quantization': quant_config,
            'HyperParameterOptimization': hpo_config
        }]
    ### if the emd loss is greater than MAGIC_MAX_EMD_DISTANCE, select qat & distillation.
    else:
        quant_config = Quantization(**default_quant_config)
        dis_config = Distillation()
        configs = [{'Quantization': quant_config, 'Distillation': dis_config}]
        _logger.info("Start Quantization and Distillation Training.")

    return configs


if __name__ == '__main__':
    create_strategy_config('sparse_0.75_int8', 'transformer')
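    ### Hypothetical smoke tests: exercise the PTQ-loss thresholds in
    ### get_final_quant_config; the loss values below are made-up examples.
    assert get_final_quant_config(0.000005) is None  # below MAGIC_MIN_EMD_DISTANCE
    print(create_strategy_config('prune_0.3_int8', None))
    print(get_final_quant_config(0.000015))  # PTQ + HPO config
    print(get_final_quant_config(0.1))  # QAT + distillation config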