# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This file is used for finetuning.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import paddle.fluid as fluid
import paddle.fluid.layers as layers

# If you use our released pretrained weights, do not change these
# hyper-parameters.
cell_clip = 3.0
proj_clip = 3.0
hidden_size = 4096
vocab_size = 52445
embed_size = 512
# According to the original paper, dropout needs to be modified for
# finetuning.
modify_dropout = 1
proj_size = 512
num_layers = 2
random_seed = 0
dropout_rate = 0.5


def dropout(input):
    return layers.dropout(
        input,
        dropout_prob=dropout_rate,
        dropout_implementation="upscale_in_train",
        seed=random_seed,
        is_test=False)


def lstmp_encoder(input_seq, gate_size, h_0, c_0, para_name):
    # An LSTM encoder implementation with projection (LSTMP).
    # The linear transformations for the input gate, output gate, forget gate
    # and cell activation vectors need to be done outside of dynamic_lstmp,
    # so the fc output size is 4 times gate_size.
    init = None  # no custom initializer for the gate weights
    input_proj = layers.fc(input=input_seq,
                           param_attr=fluid.ParamAttr(
                               name=para_name + '_gate_w', initializer=init),
                           size=gate_size * 4,
                           act=None,
                           bias_attr=False)
    hidden, cell = layers.dynamic_lstmp(
        input=input_proj,
        size=gate_size * 4,
        proj_size=proj_size,
        h_0=h_0,
        c_0=c_0,
        use_peepholes=False,
        proj_clip=proj_clip,
        cell_clip=cell_clip,
        proj_activation="identity",
        param_attr=fluid.ParamAttr(initializer=None),
        bias_attr=fluid.ParamAttr(initializer=None))
    return hidden, cell, input_proj


def encoder(x_emb, init_hidden=None, init_cell=None, para_name=''):
    rnn_input = x_emb
    rnn_outs = []
    rnn_outs_ori = []
    cells = []
    projs = []
    for i in range(num_layers):
        if init_hidden and init_cell:
            h0 = layers.squeeze(
                layers.slice(
                    init_hidden, axes=[0], starts=[i], ends=[i + 1]),
                axes=[0])
            c0 = layers.squeeze(
                layers.slice(
                    init_cell, axes=[0], starts=[i], ends=[i + 1]),
                axes=[0])
        else:
            h0 = c0 = None
        rnn_out, cell, input_proj = lstmp_encoder(
            rnn_input, hidden_size, h0, c0,
            para_name + 'layer{}'.format(i + 1))
        rnn_out_ori = rnn_out
        if i > 0:
            # Residual connection with the layer input.
            rnn_out = rnn_out + rnn_input
        rnn_out.stop_gradient = True
        rnn_outs.append(rnn_out)
        rnn_outs_ori.append(rnn_out_ori)
    # Add weight layers for finetuning: the pretrained LSTM outputs are frozen
    # and combined with the learnable scalars gamma1/gamma2.
    a1 = layers.create_parameter([1], dtype="float32", name="gamma1")
    a2 = layers.create_parameter([1], dtype="float32", name="gamma2")
    rnn_outs[0].stop_gradient = True
    rnn_outs[1].stop_gradient = True
    num_layer1 = rnn_outs[0] * a1
    num_layer2 = rnn_outs[1] * a2
    output_layer = num_layer1 * 0.5 + num_layer2 * 0.5
    return output_layer, rnn_outs_ori


def emb(x):
    x_emb = layers.embedding(
        input=x,
        size=[vocab_size, embed_size],
        dtype='float32',
        is_sparse=False,
        param_attr=fluid.ParamAttr(name='embedding_para'))
    return x_emb


def elmo_encoder(x_emb):
    x_emb_r = fluid.layers.sequence_reverse(x_emb, name=None)
    fw_hiddens, fw_hiddens_ori = encoder(x_emb, para_name='fw_')
    bw_hiddens, bw_hiddens_ori = encoder(x_emb_r, para_name='bw_')
    embedding = layers.concat(input=[fw_hiddens, bw_hiddens], axis=1)
    # Add dropout during finetuning.
    embedding = dropout(embedding)
    a = layers.create_parameter([1], dtype="float32", name="gamma")
    embedding = embedding * a
    return embedding
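

# ---------------------------------------------------------------------------
# Minimal usage sketch (an illustration, not part of the original finetune
# pipeline): it shows how emb() and elmo_encoder() above could be wired
# together to obtain ELMo features for a batch of token ids. The input name
# 'word_ids' and the helper build_elmo_graph() are hypothetical; a real
# finetune script would also load the released pretrained parameters before
# running this graph and feed the result into a downstream task network.
# ---------------------------------------------------------------------------
def build_elmo_graph():
    # Variable-length sequence of word ids (LoD level 1), one id per token.
    word_ids = fluid.layers.data(
        name='word_ids', shape=[1], dtype='int64', lod_level=1)
    # Look up word embeddings, then run the bidirectional LSTMP encoder.
    word_emb = emb(word_ids)
    elmo_repr = elmo_encoder(word_emb)
    return word_ids, elmo_repr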