#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import layers

# Names exported by a star-import of this module. ``img_conv_group`` is a
# public (non-underscore) builder defined in this file, so it is listed
# alongside its siblings.
__all__ = [
    "simple_img_conv_pool",
    "sequence_conv_pool",
    "glu",
    "scaled_dot_product_attention",
    "img_conv_group",
]

def simple_img_conv_pool(input,
                         num_filters,
                         filter_size,
                         pool_size,
                         pool_stride,
                         act,
                         param_attr=None,
                         pool_type='max',
                         use_cudnn=True):
    """
    Chain one ``layers.conv2d`` call into one ``layers.pool2d`` call.

    Args:
        input: Forwarded to ``layers.conv2d`` as ``input``.
        num_filters: Forwarded to ``layers.conv2d`` as ``num_filters``.
        filter_size: Forwarded to ``layers.conv2d`` as ``filter_size``.
        pool_size: Forwarded to ``layers.pool2d`` as ``pool_size``.
        pool_stride: Forwarded to ``layers.pool2d`` as ``pool_stride``.
        act: Forwarded to ``layers.conv2d`` as ``act``.
        param_attr: Forwarded to ``layers.conv2d`` as ``param_attr``.
            Default: None.
        pool_type: Forwarded to ``layers.pool2d`` as ``pool_type``.
            Default: 'max'.
        use_cudnn: Forwarded to both the convolution and the pooling call.
            Default: True.

    Returns:
        Whatever ``layers.pool2d`` returns for the convolution output.
    """
    convolved = layers.conv2d(
        input=input,
        num_filters=num_filters,
        filter_size=filter_size,
        param_attr=param_attr,
        act=act,
        use_cudnn=use_cudnn)

    # The pooled result is the only value handed back to the caller.
    return layers.pool2d(
        input=convolved,
        pool_size=pool_size,
        pool_type=pool_type,
        pool_stride=pool_stride,
        use_cudnn=use_cudnn)


def img_conv_group(input,
                   conv_num_filter,
                   pool_size,
                   conv_padding=1,
                   conv_filter_size=3,
                   conv_act=None,
                   param_attr=None,
                   conv_with_batchnorm=False,
                   conv_batchnorm_drop_rate=0.0,
                   pool_stride=1,
                   pool_type=None,
                   use_cudnn=True):
    """
    Image Convolution Group, used for VGG-style nets.

    Stacks ``len(conv_num_filter)`` ``layers.conv2d`` calls, each optionally
    followed by ``layers.batch_norm`` and ``layers.dropout``, and finishes
    with a single ``layers.pool2d`` call.

    Args:
        input: Input of the first convolution.
        conv_num_filter (list|tuple): Number of filters of each convolution;
            its length determines how many convolutions are stacked.
        pool_size: Forwarded to ``layers.pool2d`` as ``pool_size``.
        conv_padding: Padding of the convolutions. A value without
            ``__len__`` (or a string) is shared by every convolution;
            otherwise it is indexed per convolution. Default: 1.
        conv_filter_size: Filter size, shared or per convolution (same rule
            as ``conv_padding``). Default: 3.
        conv_act: Activation of each convolution. When batch norm is enabled
            for a convolution, the activation is passed to
            ``layers.batch_norm`` instead of ``layers.conv2d``.
            Default: None.
        param_attr: Parameter attribute, shared or per convolution (same rule
            as ``conv_padding``). Default: None.
        conv_with_batchnorm: Whether a convolution is followed by batch norm,
            shared or per convolution. Default: False.
        conv_batchnorm_drop_rate: Dropout probability applied after batch
            norm, shared or per convolution. Dropout is skipped when the
            absolute rate is not greater than 1e-5. Default: 0.0.
        pool_stride: Forwarded to ``layers.pool2d``. Default: 1.
        pool_type: Forwarded unchanged to ``layers.pool2d``. Default: None.
            NOTE(review): confirm that ``layers.pool2d`` accepts None;
            otherwise callers must always pass ``pool_type`` explicitly.
        use_cudnn: Forwarded to every convolution and to the pooling call.
            Default: True.

    Returns:
        The output of the final ``layers.pool2d`` call.

    Raises:
        AssertionError: If ``conv_num_filter`` is neither a list nor a tuple.
    """
    assert isinstance(conv_num_filter, (list, tuple))

    num_convs = len(conv_num_filter)
    # ``str`` on Python 3; ``str`` and ``unicode`` on Python 2.
    string_types = (str, type(u""))

    def __extend_list__(obj):
        # Strings expose ``__len__`` but denote one setting (presumably a
        # parameter name for ``param_attr`` -- TODO confirm), so they are
        # broadcast to every convolution instead of being indexed character
        # by character.
        if isinstance(obj, string_types) or not hasattr(obj, '__len__'):
            return [obj] * num_convs
        return obj

    conv_padding = __extend_list__(conv_padding)
    conv_filter_size = __extend_list__(conv_filter_size)
    param_attr = __extend_list__(param_attr)
    conv_with_batchnorm = __extend_list__(conv_with_batchnorm)
    conv_batchnorm_drop_rate = __extend_list__(conv_batchnorm_drop_rate)

    tmp = input
    # ``enumerate`` replaces ``xrange`` so the loop runs on Python 2 and 3.
    for i, num_filters in enumerate(conv_num_filter):
        use_batchnorm = conv_with_batchnorm[i]

        # With batch norm the activation is deferred to the batch_norm call,
        # so the convolution itself is created without one.
        tmp = layers.conv2d(
            input=tmp,
            num_filters=num_filters,
            filter_size=conv_filter_size[i],
            padding=conv_padding[i],
            param_attr=param_attr[i],
            act=None if use_batchnorm else conv_act,
            use_cudnn=use_cudnn)

        if use_batchnorm:
            tmp = layers.batch_norm(input=tmp, act=conv_act)
            drop_rate = conv_batchnorm_drop_rate[i]
            # A (near-)zero rate means no dropout layer is added.
            if abs(drop_rate) > 1e-5:
                tmp = layers.dropout(x=tmp, dropout_prob=drop_rate)

    pool_out = layers.pool2d(
        input=tmp,
        pool_size=pool_size,
        pool_type=pool_type,
        pool_stride=pool_stride,
        use_cudnn=use_cudnn)
    return pool_out

def sequence_conv_pool(input,
                       num_filters,
                       filter_size,
                       param_attr=None,
                       act="sigmoid",
                       pool_type="max"):
    """
    Chain one ``layers.sequence_conv`` call into one ``layers.sequence_pool``
    call.

    Args:
        input: Forwarded to ``layers.sequence_conv`` as ``input``.
        num_filters: Forwarded to ``layers.sequence_conv``.
        filter_size: Forwarded to ``layers.sequence_conv``.
        param_attr: Forwarded to ``layers.sequence_conv``. Default: None.
        act: Activation forwarded to ``layers.sequence_conv``.
            Default: "sigmoid".
        pool_type: Forwarded to ``layers.sequence_pool``. Default: "max".

    Returns:
        Whatever ``layers.sequence_pool`` returns for the convolution output.
    """
    seq_features = layers.sequence_conv(
        input=input,
        num_filters=num_filters,
        filter_size=filter_size,
        param_attr=param_attr,
        act=act)

    return layers.sequence_pool(input=seq_features, pool_type=pool_type)

def glu(input, dim=-1):
    r"""
    Gated linear unit built from a split, a sigmoid activation and an
    elementwise multiplication. The input is split into two equally sized
    parts :math:`a` and :math:`b` along the given dimension, and the result
    is computed as:

        .. math::

            {GLU}(a, b)= a \otimes \sigma(b)

    Refer to `Language Modeling with Gated Convolutional Networks
    <https://arxiv.org/pdf/1612.08083.pdf>`_.

    Args:
        input (Variable): The input variable which is a Tensor or LoDTensor.
        dim (int): The dimension along which to split. If :math:`dim < 0`, the
            dimension to split along is :math:`rank(input) + dim`.

    Returns:
        Variable: The Tensor variable with half the size of input.

    Examples:
        .. code-block:: python

            # x is a Tensor variable with shape [3, 6, 9]
            fluid.nets.glu(input=x, dim=1)  # shape of output: [3, 3, 9]
    """
    # First half carries the values, second half drives the gate.
    value_part, gate_part = layers.split(input, num_or_sections=2, dim=dim)
    gate = layers.sigmoid(x=gate_part)
    return layers.elementwise_mul(x=value_part, y=gate)


def scaled_dot_product_attention(queries,
                                 keys,
                                 values,
                                 num_heads=1,
                                 dropout_rate=0.):
    r"""
    The (multi-head) scaled dot-product attention.

    Attention mechanism can be seen as mapping a query and a set of key-value
    pairs to an output. The output is computed as a weighted sum of the values,
    where the weight assigned to each value is computed by a compatibility
    function (dot-product here) of the query with the corresponding key.

    The dot-product attention can be implemented through (batch) matrix
    multiplication as follows:

        .. math::

            Attention(Q, K, V)= softmax(\frac{QK^\mathrm{T}}{\sqrt{d_k}})V

    where :math:`d_k` is the hidden size of the keys per attention head.

    Refer to `Attention Is All You Need
    <https://arxiv.org/pdf/1706.03762.pdf>`_.

    When ``num_heads`` is 1 the inputs are used as they are; otherwise each
    of them first goes through a ``layers.fc`` projection of unchanged size
    and is then split into ``num_heads`` heads.

    Args:

        queries (Variable): The input variable which should be a 3-D Tensor.
        keys (Variable): The input variable which should be a 3-D Tensor.
        values (Variable): The input variable which should be a 3-D Tensor.
        num_heads (int): Head number to compute the scaled dot product
                         attention. Default value is 1.
        dropout_rate (float): The dropout rate to drop the attention weight.
                              Default value is 0.

    Returns:

        Variable: A 3-D Tensor computed by multi-head scaled dot product
                  attention.

    Raises:
        ValueError: If the inputs are not all 3-D, if the hidden sizes of
                    queries and keys differ, if the sequence lengths of keys
                    and values differ, or if the hidden size of keys or
                    values is not divisible by ``num_heads``.

    Examples:
        .. code-block:: python

            # Suppose q, k, v are Tensors with the following shape:
            # q: [3, 5, 9], k: [3, 6, 9], v: [3, 6, 10]

            contexts = fluid.nets.scaled_dot_product_attention(q, k, v)
            contexts.shape  # [3, 5, 10]
    """
    if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3):
        raise ValueError(
            "Inputs quries, keys and values should all be 3-D tensors.")

    if queries.shape[-1] != keys.shape[-1]:
        raise ValueError(
            "The hidden size of queries and keys should be the same.")
    # Every key must have a matching value, so this compares keys to values.
    if keys.shape[-2] != values.shape[-2]:
        raise ValueError(
            "The max sequence length in value batch and in key batch "
            "should be the same.")
    if keys.shape[-1] % num_heads != 0:
        raise ValueError("The hidden size of keys (%d) must be divisible "
                         "by the number of attention heads (%d)." %
                         (keys.shape[-1], num_heads))
    if values.shape[-1] % num_heads != 0:
        raise ValueError("The hidden size of values (%d) must be divisible "
                         "by the number of attention heads (%d)." %
                         (values.shape[-1], num_heads))

    def __compute_qkv(queries, keys, values, num_heads):
        """
        Project queries, keys and values with one fc layer each, keeping
        their hidden sizes. With a single head the inputs are returned
        untouched.
        """
        if num_heads == 1:
            return queries, keys, values

        q = layers.fc(input=queries, size=queries.shape[-1], num_flatten_dims=2)
        k = layers.fc(input=keys, size=keys.shape[-1], num_flatten_dims=2)
        v = layers.fc(input=values, size=values.shape[-1], num_flatten_dims=2)
        return q, k, v

    def __split_heads(x, num_heads):
        """
        Reshape the last dimension of input tensor x so that it becomes two
        dimensions, then move the head dimension in front of the sequence
        dimension.

        Args:
            x(Tensor): a 3-D input Tensor.
            num_heads(int): The number of heads.

        Returns:
            Tensor: x itself when num_heads is 1; otherwise a 4-D Tensor with
                    shape [batch_size, num_heads, max_sequence_length,
                    m / num_heads], where m is the size of the last
                    dimension of x.
        """
        if num_heads == 1:
            return x

        hidden_size = x.shape[-1]
        # reshape the 3-D input: [batch_size, max_sequence_length, hidden_dim]
        # into a 4-D output:
        # [batch_size, max_sequence_length, num_heads, hidden_size_per_head].
        reshaped = layers.reshape(
            x=x,
            shape=list(x.shape[:-1]) + [num_heads, hidden_size // num_heads])

        # permute the dimensions into:
        # [batch_size, num_heads, max_sequence_len, hidden_size_per_head]
        return layers.transpose(x=reshaped, perm=[0, 2, 1, 3])

    def __combine_heads(x):
        """
        Reshape the last two dimensions of input tensor x so that it becomes
        one dimension. This undoes ``__split_heads``.

        Args:
            x(Tensor): a 4-D input Tensor with shape
                       [bs, num_heads, max_sequence_length, hidden_dim], or a
                       3-D Tensor when the heads were never split.

        Returns:
            Tensor: a Tensor with shape
                    [bs, max_sequence_length, num_heads * hidden_dim]; a 3-D
                    input is returned unchanged.

        Raises:
            ValueError: If x is neither 3-D nor 4-D.
        """
        # Single-head path: there is nothing to merge, hand the tensor back
        # (returning nothing here would make the whole function return None).
        if len(x.shape) == 3:
            return x
        if len(x.shape) != 4:
            raise ValueError("Input(x) should be a 4-D Tensor.")

        trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
        # A real list (not a lazy ``map`` object) so the shape is usable on
        # both Python 2 and Python 3.
        merged_shape = [
            int(trans_x.shape[0]), int(trans_x.shape[1]),
            int(trans_x.shape[2] * trans_x.shape[3])
        ]
        return layers.reshape(x=trans_x, shape=merged_shape)

    q, k, v = __compute_qkv(queries, keys, values, num_heads)

    q = __split_heads(q, num_heads)
    k = __split_heads(k, num_heads)
    v = __split_heads(v, num_heads)

    key_dim_per_head = keys.shape[-1] // num_heads
    scaled_q = layers.scale(x=q, scale=key_dim_per_head**-0.5)
    # Q * K^T: one row of scores per query position and one column per key
    # position, so the result can later be multiplied with the values.
    product = layers.matmul(x=scaled_q, y=k, transpose_y=True)

    # Flatten to 2-D, apply the softmax activation, then restore the shape.
    weights = layers.reshape(
        x=layers.reshape(
            x=product, shape=[-1, product.shape[-1]], act="softmax"),
        shape=product.shape)
    if dropout_rate:
        weights = layers.dropout(
            weights, dropout_prob=dropout_rate, is_test=False)
    ctx_multiheads = layers.matmul(weights, v)
    return __combine_heads(ctx_multiheads)