# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"dygraph transformer layers"

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import six
import json
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph import Embedding, LayerNorm, Linear, to_variable, Layer, guard
from .transformer_encoder import EncoderLayer


class BertModelLayer(Layer):
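    """BERT implemented as a dygraph Layer: word/position/sentence-type
    embeddings, a factorized projection from emb_size to hidden_size, a
    searchable transformer encoder (EncoderLayer), and an optional pooled
    first-token feature per encoder output."""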
    def __init__(self,
                 emb_size=128,
                 hidden_size=768,
                 n_layer=12,
                 voc_size=30522,
                 max_position_seq_len=512,
                 sent_types=2,
                 return_pooled_out=True,
                 initializer_range=1.0,
                 conv_type="conv_bn",
                 search_layer=True,
                 use_fp16=False):
        super(BertModelLayer, self).__init__()

        self._emb_size = emb_size
        self._hidden_size = hidden_size
        self._n_layer = n_layer
        self._voc_size = voc_size
        self._max_position_seq_len = max_position_seq_len
        self._sent_types = sent_types
        self.return_pooled_out = return_pooled_out

        self._word_emb_name = "s_word_embedding"
        self._pos_emb_name = "s_pos_embedding"
        self._sent_emb_name = "s_sent_embedding"
        self._dtype = "float16" if use_fp16 else "float32"

        self._conv_type = conv_type
        self._search_layer = search_layer
        self._param_initializer = fluid.initializer.TruncatedNormal(
            scale=initializer_range)

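        # Embedding tables for word ids, position ids and sentence-type ids.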
        self._src_emb = Embedding(
            size=[self._voc_size, self._emb_size],
            param_attr=fluid.ParamAttr(
                name=self._word_emb_name, initializer=self._param_initializer),
            dtype=self._dtype)

        self._pos_emb = Embedding(
            size=[self._max_position_seq_len, self._emb_size],
            param_attr=fluid.ParamAttr(
                name=self._pos_emb_name, initializer=self._param_initializer),
            dtype=self._dtype)

        self._sent_emb = Embedding(
            size=[self._sent_types, self._emb_size],
            param_attr=fluid.ParamAttr(
                name=self._sent_emb_name, initializer=self._param_initializer),
            dtype=self._dtype)

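        # Factorized embedding: project the smaller embedding size up to the
        # transformer hidden size.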
        self._emb_fac = Linear(
            input_dim=self._emb_size,
            output_dim=self._hidden_size,
            param_attr=fluid.ParamAttr(name="s_emb_factorization"))

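        # Pooled output: a tanh FC applied to the first-token feature of each
        # encoder output.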
        self.pooled_fc = Linear(
            input_dim=self._hidden_size,
            output_dim=self._hidden_size,
            param_attr=fluid.ParamAttr(
                name="s_pooled_fc.w_0", initializer=self._param_initializer),
            bias_attr="s_pooled_fc.b_0",
            act="tanh")

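        # Searchable transformer encoder; conv_type and search_layer are
        # forwarded to EncoderLayer to configure its search space.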
        self._encoder = EncoderLayer(
            n_layer=self._n_layer,
            hidden_size=self._hidden_size,
            conv_type=self._conv_type,
            search_layer=self._search_layer)

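    # Maximum FLOPs / model size, as exposed by the underlying EncoderLayer.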
    def max_flops(self):
        return self._encoder.max_flops

    def max_model_size(self):
        return self._encoder.max_model_size

    def arch_parameters(self):
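        # The architecture-search variables exposed by the encoder.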
        return [self._encoder.alphas, self._encoder.k]

    def forward(self,
                src_ids,
                position_ids,
                sentence_ids,
                flops=[],
                model_size=[],
                alphas=None,
                k=None):
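        """Embed and sum src/position/sentence ids, project to hidden_size,
        and run the encoder. Returns the encoder outputs alone when
        return_pooled_out is False, otherwise (enc_outputs, next_sent_feats,
        k_i)."""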
        src_emb = self._src_emb(src_ids)
        pos_emb = self._pos_emb(position_ids)
        sent_emb = self._sent_emb(sentence_ids)

        emb_out = src_emb + pos_emb
        emb_out = emb_out + sent_emb

        emb_out = self._emb_fac(emb_out)
        enc_outputs, k_i = self._encoder(
            emb_out, flops=flops, model_size=model_size, alphas=alphas, k=k)

        if not self.return_pooled_out:
            return enc_outputs
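        # Pool the first token of each encoder output through the tanh FC to
        # get a next-sentence feature.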
        next_sent_feats = []
        for enc_output in enc_outputs:
            next_sent_feat = fluid.layers.slice(
                input=enc_output, axes=[1], starts=[0], ends=[1])
            next_sent_feat = self.pooled_fc(next_sent_feat)
            next_sent_feat = fluid.layers.reshape(
                next_sent_feat, shape=[-1, self._hidden_size])
            next_sent_feats.append(next_sent_feat)

        return enc_outputs, next_sent_feats, k_i