Distillation:
  distill_lambda: 1.0
  distill_loss: l2_loss
  distill_node_pair:
  - teacher_tmp_9
  - tmp_9
  - teacher_tmp_12
  - tmp_12
  - teacher_tmp_15
  - tmp_15
  - teacher_tmp_18
  - tmp_18
  - teacher_tmp_21
  - tmp_21
  - teacher_tmp_24
  - tmp_24
  - teacher_tmp_27
  - tmp_27
  - teacher_tmp_30
  - tmp_30
  - teacher_tmp_33
  - tmp_33
  - teacher_tmp_36
  - tmp_36
  - teacher_tmp_39
  - tmp_39
  - teacher_tmp_42
  - tmp_42
  - teacher_linear_147.tmp_1
  - linear_147.tmp_1
  merge_feed: true
  teacher_model_dir: ../auto-compression_origin/static_bert_models
  teacher_model_filename: bert.pdmodel
  teacher_params_filename: bert.pdiparams
Quantization:
  activation_bits: 8
  is_full_quantize: false
  not_quant_pattern:
  - skip_quant
  quantize_op_types:
  - conv2d
  - depthwise_conv2d
  - mul
  - matmul
  weight_bits: 8
TrainConfig:
  epochs: 3
  eval_iter: 1000
  learning_rate: 0.0001
  optimizer: SGD
  optim_args:
    weight_decay: 4.0e-05
  origin_metric: 0.93
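For reference, the flat `distill_node_pair` list is read as alternating (teacher, student) tensor names: each consecutive pair ties one teacher activation to the matching student activation with the configured `l2_loss`, scaled by `distill_lambda`. The twelve `tmp_*` pairs presumably correspond to the twelve transformer-layer outputs of a BERT-base model, and `linear_147.tmp_1` to the final classifier output; the `teacher_` prefix reflects the teacher graph being merged into the student program (`merge_feed: true`). The snippet below is a minimal sketch for sanity-checking the pairing, not PaddleSlim's own loader; the filename `act_config.yaml` is a placeholder for wherever this YAML is saved, and PyYAML is assumed to be installed.

```python
import yaml

# Load the config shown above; "act_config.yaml" is a placeholder path.
with open("act_config.yaml") as f:
    cfg = yaml.safe_load(f)

# Consecutive entries form (teacher_node, student_node) pairs:
# even indices are teacher tensors, odd indices are student tensors.
nodes = cfg["Distillation"]["distill_node_pair"]
pairs = list(zip(nodes[0::2], nodes[1::2]))

for teacher, student in pairs:
    # Each pair contributes one l2_loss term between the teacher and
    # student activations, weighted by distill_lambda (1.0 here).
    print(f"{teacher} -> {student}")
```

In PaddleSlim's auto-compression workflow, a config like this is typically passed to `paddleslim.auto_compression.AutoCompression` together with a training dataloader (exact keyword arguments vary across PaddleSlim versions), and `origin_metric: 0.93` records the baseline accuracy of the uncompressed model so the quantized, distilled student can be compared against it.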