diff --git a/paddleslim/teachers/bert/utils/init.py b/paddleslim/teachers/bert/utils/init.py
new file mode 100644
index 0000000000000000000000000000000000000000..52f9b38082fd79258c292c9970e3d65ffb9a2d52
--- /dev/null
+++ b/paddleslim/teachers/bert/utils/init.py
@@ -0,0 +1,245 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import os
+import six
+import ast
+import copy
+
+import numpy as np
+import paddle.fluid as fluid
+
+
+def cast_fp32_to_fp16(exe, main_program):
+    print("Cast parameters to float16 data format.")
+    for param in main_program.global_block().all_parameters():
+        if not param.name.endswith(".master"):
+            param_t = fluid.global_scope().find_var(param.name).get_tensor()
+            data = np.array(param_t)
+            if param.name.find("layer_norm") == -1:
+                param_t.set(np.float16(data).view(np.uint16), exe.place)
+            master_param_var = fluid.global_scope().find_var(param.name +
+                                                             ".master")
+            if master_param_var is not None:
+                master_param_var.get_tensor().set(data, exe.place)
+
+
+def init_checkpoint(exe, init_checkpoint_path, main_program, use_fp16=False):
+    assert os.path.exists(
+        init_checkpoint_path), "[%s] can't be found." % init_checkpoint_path
+
+    def existed_persistables(var):
+        if not fluid.io.is_persistable(var):
+            return False
+        return os.path.exists(os.path.join(init_checkpoint_path, var.name))
+
+    fluid.io.load_vars(
+        exe,
+        init_checkpoint_path,
+        main_program=main_program,
+        predicate=existed_persistables)
+    print("Load model from {}".format(init_checkpoint_path))
+
+    if use_fp16:
+        cast_fp32_to_fp16(exe, main_program)
+
+
+def init_pretraining_params(exe,
+                            pretraining_params_path,
+                            main_program,
+                            use_fp16=False):
+    assert os.path.exists(pretraining_params_path
+                          ), "[%s] can't be found." % pretraining_params_path
+
+    def existed_params(var):
+        if not isinstance(var, fluid.framework.Parameter):
+            return False
+        return os.path.exists(os.path.join(pretraining_params_path, var.name))
+
+    fluid.io.load_vars(
+        exe,
+        pretraining_params_path,
+        main_program=main_program,
+        predicate=existed_params)
+    print("Load pretraining parameters from {}.".format(
+        pretraining_params_path))
+
+    if use_fp16:
+        cast_fp32_to_fp16(exe, main_program)
+
+
+def init_from_static_model(dir_path, cls_model, bert_config):
+    def load_numpy_weight(file_name):
+        if six.PY2:
+            res = np.load(os.path.join(dir_path, file_name), allow_pickle=True)
+        else:
+            res = np.load(
+                os.path.join(dir_path, file_name),
+                allow_pickle=True,
+                encoding='latin1')
+        assert res is not None
+        return res
+
+    # load word embedding
+    _param = load_numpy_weight("word_embedding")
+    cls_model.bert_layer._src_emb.set_dict({"weight": _param})
+    print("INIT word embedding")
+
+    _param = load_numpy_weight("pos_embedding")
+    cls_model.bert_layer._pos_emb.set_dict({"weight": _param})
+    print("INIT pos embedding")
+
+    _param = load_numpy_weight("sent_embedding")
+    cls_model.bert_layer._sent_emb.set_dict({"weight": _param})
+    print("INIT sent embedding")
+
+    _param0 = load_numpy_weight("pooled_fc.w_0")
+    _param1 = load_numpy_weight("pooled_fc.b_0")
+    cls_model.bert_layer.pooled_fc.set_dict({
+        "weight": _param0,
+        "bias": _param1
+    })
+    print("INIT pooled_fc")
+
+    _param0 = load_numpy_weight("pre_encoder_layer_norm_scale")
+    _param1 = load_numpy_weight("pre_encoder_layer_norm_bias")
+    cls_model.bert_layer.pre_process_layer._sub_layers[
+        "layer_norm_0"].set_dict({
+            "weight": _param0,
+            "bias": _param1
+        })
+    print("INIT pre_encoder layer norm")
+
+    for _i in range(bert_config["num_hidden_layers"]):
+        _param_weight = "encoder_layer_%d_multi_head_att_query_fc.w_0" % _i
+        _param_bias = "encoder_layer_%d_multi_head_att_query_fc.b_0" % _i
+
+        _param_weight = load_numpy_weight(_param_weight)
+        _param_bias = load_numpy_weight(_param_bias)
+
+        cls_model.bert_layer._encoder._sub_layers[
+            "esl_%d" % _i]._multihead_attention_layer._q_fc.set_dict({
+                "weight": _param_weight,
+                "bias": _param_bias
+            })
+        print("INIT multi_head_att_query_fc %d" % _i)
+
+        _param_weight = "encoder_layer_%d_multi_head_att_key_fc.w_0" % _i
+        _param_bias = "encoder_layer_%d_multi_head_att_key_fc.b_0" % _i
+
+        _param_weight = load_numpy_weight(_param_weight)
+        _param_bias = load_numpy_weight(_param_bias)
+
+        cls_model.bert_layer._encoder._sub_layers[
+            "esl_%d" % _i]._multihead_attention_layer._k_fc.set_dict({
+                "weight": _param_weight,
+                "bias": _param_bias
+            })
+        print("INIT multi_head_att_key_fc %d" % _i)
+
+        _param_weight = "encoder_layer_%d_multi_head_att_value_fc.w_0" % _i
+        _param_bias = "encoder_layer_%d_multi_head_att_value_fc.b_0" % _i
+
+        _param_weight = load_numpy_weight(_param_weight)
+        _param_bias = load_numpy_weight(_param_bias)
+
+        cls_model.bert_layer._encoder._sub_layers[
+            "esl_%d" % _i]._multihead_attention_layer._v_fc.set_dict({
+                "weight": _param_weight,
+                "bias": _param_bias
+            })
+        print("INIT multi_head_att_value_fc %d" % _i)
+
+        # init output fc
+        _param_weight = "encoder_layer_%d_multi_head_att_output_fc.w_0" % _i
+        _param_bias = "encoder_layer_%d_multi_head_att_output_fc.b_0" % _i
+
+        _param_weight = load_numpy_weight(_param_weight)
+        _param_bias = load_numpy_weight(_param_bias)
+
+        cls_model.bert_layer._encoder._sub_layers[
+            "esl_%d" % _i]._multihead_attention_layer._proj_fc.set_dict({
+                "weight": _param_weight,
+                "bias": _param_bias
+            })
+        print("INIT multi_head_att_output_fc %d" % _i)
+
+        # init layer_norm 1
+        _param_weight = "encoder_layer_%d_post_att_layer_norm_scale" % _i
+        _param_bias = "encoder_layer_%d_post_att_layer_norm_bias" % _i
+
+        _param_weight = load_numpy_weight(_param_weight)
+        _param_bias = load_numpy_weight(_param_bias)
+
+        cls_model.bert_layer._encoder._sub_layers[
+            "esl_%d" % _i]._postprocess_layer.layer_norm_0.set_dict({
+                "weight": _param_weight,
+                "bias": _param_bias
+            })
+        print("INIT layer norm in attention at %d layer" % _i)
+
+        # init layer_norm 2
+        _param_weight = "encoder_layer_%d_post_ffn_layer_norm_scale" % _i
+        _param_bias = "encoder_layer_%d_post_ffn_layer_norm_bias" % _i
+
+        _param_weight = load_numpy_weight(_param_weight)
+        _param_bias = load_numpy_weight(_param_bias)
+
+        cls_model.bert_layer._encoder._sub_layers[
+            "esl_%d" % _i]._postprocess_layer2.layer_norm_0.set_dict({
+                "weight": _param_weight,
+                "bias": _param_bias
+            })
+        print("INIT layer norm in FFN at %d layer" % _i)
+
+        # init FFN 1
+        _param_weight = "encoder_layer_%d_ffn_fc_0.w_0" % _i
+        _param_bias = "encoder_layer_%d_ffn_fc_0.b_0" % _i
+
+        _param_weight = load_numpy_weight(_param_weight)
+        _param_bias = load_numpy_weight(_param_bias)
+
+        cls_model.bert_layer._encoder._sub_layers[
+            "esl_%d" % _i]._positionwise_feed_forward._i2h.set_dict({
+                "weight": _param_weight,
+                "bias": _param_bias
+            })
+        print("INIT FFN-1 at %d layer" % _i)
+
+        # init FFN 2
+        _param_weight = "encoder_layer_%d_ffn_fc_1.w_0" % _i
+        _param_bias = "encoder_layer_%d_ffn_fc_1.b_0" % _i
+
+        _param_weight = load_numpy_weight(_param_weight)
+        _param_bias = load_numpy_weight(_param_bias)
+
+        cls_model.bert_layer._encoder._sub_layers[
+            "esl_%d" % _i]._positionwise_feed_forward._h2o.set_dict({
+                "weight": _param_weight,
+                "bias": _param_bias
+            })
+        print("INIT FFN-2 at %d layer" % _i)
+
+    # init cls fc
+    #_param_weight = "cls_out_w"
+    #_param_bias = "cls_out_b"
+
+    #_param_weight = load_numpy_weight(_param_weight)
+    #_param_bias = load_numpy_weight(_param_bias)
+
+    #cls_model.cls_fc.set_dict({"weight":_param_weight, "bias":_param_bias})
+    #print("INIT CLS FC layer")
+    return True