# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""ELECTRA model."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import six
import json

import paddle.fluid as fluid

from chinese_electra_small.model.transformer_encoder import encoder, pre_process_layer


class ElectraConfig(object):
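    """Loads ELECTRA model hyper-parameters from a JSON config file and exposes
    them through dict-style access."""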
    def __init__(self, config_path):
        self._config_dict = self._parse(config_path)

    def _parse(self, config_path):
        try:
            with open(config_path) as json_file:
                config_dict = json.load(json_file)
        except Exception:
            raise IOError("Error in parsing electra model config file '%s'" % config_path)
        else:
            return config_dict

    def __getitem__(self, key):
        return self._config_dict[key]

    def print_config(self):
        for arg, value in sorted(six.iteritems(self._config_dict)):
            print('%s: %s' % (arg, value))
        print('------------------------------------------------')


class ElectraModel(object):
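    """Transformer encoder backbone for ELECTRA.

    Builds word, position and sentence embeddings, projects them up to the
    hidden size when the embedding size differs, and runs a stacked multi-head
    self-attention encoder over the input sequence.

    A minimal usage sketch; the placeholder names, shapes and config path below
    are illustrative assumptions, not part of this module::

        config = ElectraConfig("electra_config.json")
        src_ids = fluid.layers.data("src_ids", shape=[-1, 128, 1], dtype="int64", append_batch_size=False)
        position_ids = fluid.layers.data("position_ids", shape=[-1, 128, 1], dtype="int64", append_batch_size=False)
        sentence_ids = fluid.layers.data("sentence_ids", shape=[-1, 128, 1], dtype="int64", append_batch_size=False)
        input_mask = fluid.layers.data("input_mask", shape=[-1, 128, 1], dtype="float32", append_batch_size=False)
        electra = ElectraModel(src_ids, position_ids, sentence_ids, input_mask, config)
        sequence_output = electra.get_sequence_output()  # [batch_size, seq_len, hidden_size]
    """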
    def __init__(self, src_ids, position_ids, sentence_ids, input_mask, config, weight_sharing=True, use_fp16=False):

        self._emb_size = 128
        self._hidden_size = config['hidden_size']
        self._n_layer = config['num_hidden_layers']
        self._n_head = config['num_attention_heads']
        self._voc_size = config['vocab_size']
        self._max_position_seq_len = config['max_position_embeddings']
        self._sent_types = config['type_vocab_size']
        self._hidden_act = config['hidden_act']
        self._prepostprocess_dropout = config['hidden_dropout_prob']
        self._attention_dropout = config['attention_probs_dropout_prob']
        self._weight_sharing = weight_sharing

        self._word_emb_name = "word_embedding"
        self._pos_emb_name = "pos_embedding"
        self._sent_emb_name = "sent_embedding"
        self._dtype = "float16" if use_fp16 else "float32"

        # Initialize all weights with a truncated normal initializer; all biases
        # are initialized to zero by default.
        self._param_initializer = fluid.initializer.TruncatedNormal(scale=config['initializer_range'])

        self._build_model(src_ids, position_ids, sentence_ids, input_mask)

    def _build_model(self, src_ids, position_ids, sentence_ids, input_mask):
        # padding id in vocabulary must be set to 0
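        # look up word, position and sentence (token type) embeddings; the three are summed below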
        emb_out = fluid.layers.embedding(input=src_ids,
                                         size=[self._voc_size, self._emb_size],
                                         dtype=self._dtype,
                                         param_attr=fluid.ParamAttr(name=self._word_emb_name,
                                                                    initializer=self._param_initializer),
                                         is_sparse=False)
        position_emb_out = fluid.layers.embedding(input=position_ids,
                                                  size=[self._max_position_seq_len, self._emb_size],
                                                  dtype=self._dtype,
                                                  param_attr=fluid.ParamAttr(name=self._pos_emb_name,
                                                                             initializer=self._param_initializer))

        sent_emb_out = fluid.layers.embedding(sentence_ids,
                                              size=[self._sent_types, self._emb_size],
                                              dtype=self._dtype,
                                              param_attr=fluid.ParamAttr(name=self._sent_emb_name,
                                                                         initializer=self._param_initializer))

        emb_out = emb_out + position_emb_out
        emb_out = emb_out + sent_emb_out

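        # 'nd' is the pre-process command string, typically layer normalization ('n')
        # followed by dropout ('d') as interpreted by pre_process_layer in transformer_encoder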
        emb_out = pre_process_layer(emb_out, 'nd', self._prepostprocess_dropout, name='pre_encoder')

        if self._emb_size != self._hidden_size:
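            # ELECTRA-small uses an embedding size (128) smaller than the hidden size,
            # so project the embeddings up to the encoder width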
            emb_out = fluid.layers.fc(input=emb_out,
                                      size=self._hidden_size,
                                      act=None,
                                      param_attr=fluid.ParamAttr(name="embeddings_project.w_0",
                                                                 initializer=self._param_initializer),
                                      num_flatten_dims=2,
                                      bias_attr="embeddings_project.b_0")

        if self._dtype == "float16":
            input_mask = fluid.layers.cast(x=input_mask, dtype=self._dtype)

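        # build the additive attention bias: the outer product of the padding mask is 1
        # only where both positions are real tokens; 10000 * (mask - 1) maps padded pairs
        # to a -10000 bias, which is then replicated for every attention head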
        self_attn_mask = fluid.layers.matmul(x=input_mask, y=input_mask, transpose_y=True)
        self_attn_mask = fluid.layers.scale(x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False)
        n_head_self_attn_mask = fluid.layers.stack(x=[self_attn_mask] * self._n_head, axis=1)
        n_head_self_attn_mask.stop_gradient = True

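        # run the stacked transformer encoder; each head uses hidden_size // n_head
        # dimensions and the feed-forward inner size is 4 * hidden_size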
        self._enc_out = encoder(enc_input=emb_out,
                                attn_bias=n_head_self_attn_mask,
                                n_layer=self._n_layer,
                                n_head=self._n_head,
                                d_key=self._hidden_size // self._n_head,
                                d_value=self._hidden_size // self._n_head,
                                d_model=self._hidden_size,
                                d_inner_hid=self._hidden_size * 4,
                                prepostprocess_dropout=self._prepostprocess_dropout,
                                attention_dropout=self._attention_dropout,
                                relu_dropout=0,
                                hidden_act=self._hidden_act,
                                preprocess_cmd="",
                                postprocess_cmd="dan",
                                param_initializer=self._param_initializer,
                                name='encoder')

    def get_sequence_output(self):
        return self._enc_out

    def get_pooled_output(self):
        """Get the first feature of each sequence for classification"""
        next_sent_feat = fluid.layers.slice(input=self._enc_out, axes=[1], starts=[0], ends=[1])
        return next_sent_feat

    def get_pretraining_output(self, mask_label, mask_pos, labels):
        """Get the loss & accuracy for pretraining"""

        mask_pos = fluid.layers.cast(x=mask_pos, dtype='int32')

        # extract the first token feature in each sentence
        next_sent_feat = self.get_pooled_output()
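        # flatten [batch_size, seq_len, hidden_size] to [batch_size * seq_len, hidden_size]
        # so masked token features can be gathered by their flattened positions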
        reshaped_emb_out = fluid.layers.reshape(x=self._enc_out, shape=[-1, self._hidden_size])
        # extract masked tokens' feature
        mask_feat = fluid.layers.gather(input=reshaped_emb_out, index=mask_pos)

        # transform: fc
        mask_trans_feat = fluid.layers.fc(input=mask_feat,
                                          size=self._hidden_size,
                                          act=self._hidden_act,
                                          param_attr=fluid.ParamAttr(name='mask_lm_trans_fc.w_0',
                                                                     initializer=self._param_initializer),
                                          bias_attr=fluid.ParamAttr(name='mask_lm_trans_fc.b_0'))
        # transform: layer norm
        mask_trans_feat = pre_process_layer(mask_trans_feat, 'n', name='mask_lm_trans')

        mask_lm_out_bias_attr = fluid.ParamAttr(name="mask_lm_out_fc.b_0",
                                                initializer=fluid.initializer.Constant(value=0.0))
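        # when weight sharing is enabled, tie the masked-LM output projection to the
        # input word embedding matrix; otherwise use a separate fc projection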
        if self._weight_sharing:
            fc_out = fluid.layers.matmul(x=mask_trans_feat,
                                         y=fluid.default_main_program().global_block().var(self._word_emb_name),
                                         transpose_y=True)
            fc_out += fluid.layers.create_parameter(shape=[self._voc_size],
                                                    dtype=self._dtype,
                                                    attr=mask_lm_out_bias_attr,
                                                    is_bias=True)

        else:
            fc_out = fluid.layers.fc(input=mask_trans_feat,
                                     size=self._voc_size,
                                     param_attr=fluid.ParamAttr(name="mask_lm_out_fc.w_0",
                                                                initializer=self._param_initializer),
                                     bias_attr=mask_lm_out_bias_attr)

        mask_lm_loss = fluid.layers.softmax_with_cross_entropy(logits=fc_out, label=mask_label)
        mean_mask_lm_loss = fluid.layers.mean(mask_lm_loss)

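        # binary next-sentence classification head on the pooled first-token feature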
        next_sent_fc_out = fluid.layers.fc(input=next_sent_feat,
                                           size=2,
                                           param_attr=fluid.ParamAttr(name="next_sent_fc.w_0",
                                                                      initializer=self._param_initializer),
                                           bias_attr="next_sent_fc.b_0")

        next_sent_loss, next_sent_softmax = fluid.layers.softmax_with_cross_entropy(logits=next_sent_fc_out,
                                                                                    label=labels,
                                                                                    return_softmax=True)

        next_sent_acc = fluid.layers.accuracy(input=next_sent_softmax, label=labels)

        mean_next_sent_loss = fluid.layers.mean(next_sent_loss)

        loss = mean_next_sent_loss + mean_mask_lm_loss
        return next_sent_acc, mean_mask_lm_loss, loss