# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

from six.moves import reduce

from .. import core
from ..layers import utils
from . import layers
from ..framework import Variable, OpProtoHolder
from ..layers import layer_function_generator
from ..param_attr import ParamAttr
from ..initializer import Normal, Constant
__all__ = [
    'Conv2D', 'Pool2D', 'FC', 'BatchNorm', 'Embedding', 'GRUUnit', 'LayerNorm'
]


class Conv2D(layers.Layer):
    def __init__(self,
                 name_scope,
                 num_channels,
                 num_filters,
                 filter_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 groups=None,
                 use_cudnn=True,
                 act=None,
                 param_attr=None,
                 bias_attr=None,
                 dtype=core.VarDesc.VarType.FP32):
        assert param_attr is not False, "param_attr should not be False here."
        super(Conv2D, self).__init__(name_scope)
        self._groups = groups
        self._stride = utils.convert_to_list(stride, 2, 'stride')
        self._padding = utils.convert_to_list(padding, 2, 'padding')
        self._dilation = utils.convert_to_list(dilation, 2, 'dilation')
        self._act = act
        if not isinstance(use_cudnn, bool):
            raise ValueError("use_cudnn should be True or False")
        self._use_cudnn = use_cudnn
        self._num_channels = num_channels
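        # Select the dedicated depthwise_conv2d kernel when every input channel
        # forms its own group, the filter count is a multiple of the channel
        # count, and cuDNN is disabled; otherwise use the generic conv2d op.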
        if (self._num_channels == self._groups and
                num_filters % self._num_channels == 0 and not self._use_cudnn):
            self._l_type = 'depthwise_conv2d'
        else:
            self._l_type = 'conv2d'

        if groups is None:
            num_filter_channels = num_channels
        else:
            if num_channels % groups != 0:
                raise ValueError("num_channels must be divisible by groups.")
            num_filter_channels = num_channels // groups
        filter_size = utils.convert_to_list(filter_size, 2, 'filter_size')
        filter_shape = [num_filters, int(num_filter_channels)] + filter_size

        def _get_default_param_initializer():
            filter_elem_num = filter_size[0] * filter_size[1] * num_channels
            std = (2.0 / filter_elem_num)**0.5
            return Normal(0.0, std, 0)

        self._filter_param = self.create_parameter(
            attr=param_attr,
            shape=filter_shape,
            dtype=self._dtype,
            default_initializer=_get_default_param_initializer())

        if self._use_cudnn:
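            # Persistable RAW variables used by the cuDNN convolution kernels
            # to cache the results of their algorithm search across calls.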
            self.create_variable(
                name="kCUDNNFwdAlgoCache",
                persistable=True,
                type=core.VarDesc.VarType.RAW)
            self.create_variable(
                name="kCUDNNBwdDataAlgoCache",
                persistable=True,
                type=core.VarDesc.VarType.RAW)
            self.create_variable(
                name="kCUDNNBwdFilterAlgoCache",
                persistable=True,
                type=core.VarDesc.VarType.RAW)

        self._bias_param = self.create_parameter(
            attr=bias_attr,
            shape=[num_filters],
            dtype=self._dtype,
            is_bias=True)

    def forward(self, input):
        pre_bias = self._helper.create_variable_for_type_inference(
            dtype=self._dtype)

        self._helper.append_op(
            type=self._l_type,
            inputs={
                'Input': input,
                'Filter': self._filter_param,
            },
            outputs={"Output": pre_bias},
            attrs={
                'strides': self._stride,
                'paddings': self._padding,
                'dilations': self._dilation,
                'groups': self._groups if self._groups else 1,
                'use_cudnn': self._use_cudnn,
                'use_mkldnn': False,
            })

        pre_act = self._helper.create_variable_for_type_inference(
            dtype=self._dtype)

        self._helper.append_op(
            type='elementwise_add',
            inputs={'X': [pre_bias],
                    'Y': [self._bias_param]},
            outputs={'Out': [pre_act]},
            attrs={'axis': 1})

        # Currently, we don't support inplace in imperative mode
        return self._helper.append_activation(pre_act, act=self._act)
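
# Usage sketch for Conv2D (illustrative comment, not executed): this assumes
# the `fluid.imperative.guard()` context and `fluid.imperative.to_variable`
# helper; shapes and names are made up.
#
#     import numpy as np
#     import paddle.fluid as fluid
#
#     with fluid.imperative.guard():
#         img = fluid.imperative.to_variable(
#             np.random.random((2, 3, 32, 32)).astype('float32'))
#         conv = Conv2D("conv2d", num_channels=3, num_filters=16,
#                       filter_size=3, act="relu")
#         out = conv(img)  # NCHW feature map with 16 output channels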


class Pool2D(layers.Layer):
    def __init__(self,
                 name_scope,
                 pool_size=-1,
                 pool_type="max",
                 pool_stride=1,
                 pool_padding=0,
                 global_pooling=False,
                 use_cudnn=True,
                 ceil_mode=False,
                 exclusive=True,
                 dtype=core.VarDesc.VarType.FP32):
        if pool_type not in ["max", "avg"]:
            raise ValueError(
                "Unknown pool_type: '%s'. It can only be 'max' or 'avg'.",
                str(pool_type))

        if global_pooling is False and pool_size == -1:
            raise ValueError(
                "When the global_pooling is False, pool_size must be passed "
                "and be a valid value. Received pool_size: " + str(pool_size))

        if not isinstance(use_cudnn, bool):
            raise ValueError("use_cudnn should be True or False")

        super(Pool2D, self).__init__(name_scope, dtype=dtype)

        self._pool_type = pool_type
        self._pool_size = utils.convert_to_list(pool_size, 2, 'pool_size')
        self._pool_padding = utils.convert_to_list(pool_padding, 2,
                                                   'pool_padding')
        self._pool_stride = utils.convert_to_list(pool_stride, 2, 'pool_stride')
        self._global_pooling = global_pooling
        self._use_cudnn = use_cudnn
        self._ceil_mode = ceil_mode
        self._exclusive = exclusive
        self._l_type = 'pool2d'

    def forward(self, input):
        pool_out = self._helper.create_variable_for_type_inference(self._dtype)

        self._helper.append_op(
            type=self._l_type,
            inputs={"X": input},
            outputs={"Out": pool_out},
            attrs={
                "pooling_type": self._pool_type,
                "ksize": self._pool_size,
                "global_pooling": self._global_pooling,
                "strides": self._pool_stride,
                "paddings": self._pool_padding,
                "use_cudnn": self._use_cudnn,
                "ceil_mode": self._ceil_mode,
                "use_mkldnn": False,
                "exclusive": self._exclusive,
            })
        return pool_out
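
# Usage sketch for Pool2D (illustrative comment, not executed): assumes the
# same imperative guard/to_variable helpers as the Conv2D sketch above.
#
#     with fluid.imperative.guard():
#         x = fluid.imperative.to_variable(
#             np.random.random((2, 3, 32, 32)).astype('float32'))
#         pool = Pool2D("pool2d", pool_size=2, pool_type="max", pool_stride=2)
#         out = pool(x)  # spatial dims halved: (2, 3, 16, 16)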


class FC(layers.Layer):
    def __init__(self,
                 name_scope,
                 size,
                 param_attr=None,
                 bias_attr=None,
                 num_flatten_dims=1,
                 dtype=core.VarDesc.VarType.FP32,
                 act=None):
        super(FC, self).__init__(name_scope)

        self._size = size
        self._num_flatten_dims = num_flatten_dims
        self._dtype = dtype
        self._param_attr = param_attr
        self._bias_attr = bias_attr
        self._act = act

    def _build_once(self, input):
        input_shape = input.shape
        param_shape = [
            reduce(lambda a, b: a * b, input_shape[self._num_flatten_dims:], 1)
        ] + [self._size]
        self._w = self.create_parameter(
            attr=self._param_attr,
            shape=param_shape,
            dtype=self._dtype,
            is_bias=False)

        if self._bias_attr:
            size = [self._size]
            self._b = self.create_parameter(
                attr=self._bias_attr,
                shape=size,
                dtype=self._dtype,
                is_bias=True)
        else:
            self._b = None

    def forward(self, input):
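        # The fc computation is assembled from primitive ops: `mul` flattens
        # the input to 2-D according to num_flatten_dims and multiplies it by
        # the weight, `sum` wraps the single product (mirroring the multi-input
        # static fc layer), and `elementwise_add` applies the optional bias.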
        tmp = self._helper.create_variable_for_type_inference(self._dtype)
        self._helper.append_op(
            type="mul",
            inputs={"X": input,
                    "Y": self._w},
            outputs={"Out": tmp},
            attrs={
                "x_num_col_dims": self._num_flatten_dims,
                "y_num_col_dims": 1
            })

        pre_bias = self._helper.create_variable_for_type_inference(self._dtype)
        self._helper.append_op(
            type="sum",
            inputs={"X": [tmp]},
            outputs={"Out": pre_bias},
            attrs={"use_mkldnn": False})

        if self._b:
            pre_activation = self._helper.create_variable_for_type_inference(
                dtype=self._dtype)
            self._helper.append_op(
                type='elementwise_add',
                inputs={'X': [pre_bias],
                        'Y': [self._b]},
                outputs={'Out': [pre_activation]},
                attrs={'axis': self._num_flatten_dims})
        else:
            pre_activation = pre_bias
        # Currently, we don't support inplace in imperative mode
        return self._helper.append_activation(pre_activation, act=self._act)
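
# Usage sketch for FC (illustrative comment, not executed): assumes the same
# imperative guard/to_variable helpers as the Conv2D sketch above.
#
#     with fluid.imperative.guard():
#         x = fluid.imperative.to_variable(
#             np.random.random((8, 32)).astype('float32'))
#         fc = FC("fc", size=64, act="relu")
#         out = fc(x)  # shape (8, 64)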


class BatchNorm(layers.Layer):
    def __init__(self,
                 name_scope,
                 num_channels,
                 act=None,
                 is_test=False,
                 momentum=0.9,
                 epsilon=1e-05,
                 param_attr=None,
                 bias_attr=None,
                 dtype=core.VarDesc.VarType.FP32,
                 data_layout='NCHW',
                 in_place=False,
                 moving_mean_name=None,
                 moving_variance_name=None,
                 do_model_average_for_mean_and_var=False,
                 fuse_with_relu=False,
                 use_global_stats=False):
        super(BatchNorm, self).__init__(name_scope)
        self._param_attr = param_attr
        self._bias_attr = bias_attr
        self._act = act

        assert bias_attr is not False, "bias_attr should not be False in batch_norm."

        if dtype == core.VarDesc.VarType.FP16:
            self._dtype = core.VarDesc.VarType.FP32
        else:
            self._dtype = dtype

        param_shape = [num_channels]

        # create parameter
        self._scale = self.create_parameter(
            attr=self._param_attr,
            shape=param_shape,
            dtype=self._dtype,
            default_initializer=Constant(1.0))
        if use_global_stats and self._param_attr.learning_rate == 0.:
            self._scale._stop_gradient = True

        self._bias = self.create_parameter(
            attr=self._bias_attr,
            shape=param_shape,
            dtype=self._dtype,
            is_bias=True)
        if use_global_stats and self._param_attr.learning_rate == 0.:
            self._bias._stop_gradient = True

        self._mean = self.create_parameter(
            attr=ParamAttr(
                name=moving_mean_name,
                initializer=Constant(0.0),
                trainable=False,
                do_model_average=do_model_average_for_mean_and_var),
            shape=param_shape,
            dtype=self._dtype)
        self._mean._stop_gradient = True

        self._variance = self.create_parameter(
            attr=ParamAttr(
                name=moving_variance_name,
                initializer=Constant(1.0),
                trainable=False,
                do_model_average=do_model_average_for_mean_and_var),
            shape=param_shape,
            dtype=self._dtype)
        self._variance._stop_gradient = True

        self._in_place = in_place
        self._momentum = momentum
        self._epsilon = epsilon
        self._is_test = is_test
        self._fuse_with_relu = fuse_with_relu
        self._use_global_stats = use_global_stats

    def _build_once(self, input):
        pass

    def forward(self, input):
        # create output
        # mean and mean_out share the same memory
        mean_out = self._mean
        # variance and variance out share the same memory
        variance_out = self._variance

        saved_mean = self._helper.create_variable_for_type_inference(
            dtype=self._dtype, stop_gradient=True)
        saved_variance = self._helper.create_variable_for_type_inference(
            dtype=self._dtype, stop_gradient=True)
        batch_norm_out = input if self._in_place else self._helper.create_variable_for_type_inference(
            self._dtype)

        self._helper.append_op(
            type="batch_norm",
            inputs={
                "X": input,
                "Scale": self._scale,
                "Bias": self._bias,
                "Mean": self._mean,
                "Variance": self._variance
            },
            outputs={
                "Y": batch_norm_out,
                "MeanOut": mean_out,
                "VarianceOut": variance_out,
                "SavedMean": saved_mean,
                "SavedVariance": saved_variance
            },
            attrs={
                "momentum": self._momentum,
                "epsilon": self._epsilon,
                "is_test": self._is_test,
                "use_mkldnn": False,
                "fuse_with_relu": self._fuse_with_relu,
                "use_global_stats": self._use_global_stats
            })

        # Currently, we don't support inplace in imperative mode
        return self._helper.append_activation(batch_norm_out, self._act)
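
# Usage sketch for BatchNorm (illustrative comment, not executed): assumes the
# same imperative guard/to_variable helpers as the Conv2D sketch above.
#
#     with fluid.imperative.guard():
#         x = fluid.imperative.to_variable(
#             np.random.random((2, 16, 8, 8)).astype('float32'))
#         bn = BatchNorm("batch_norm", num_channels=16, act="relu")
#         out = bn(x)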


class Embedding(layers.Layer):
    """
    **Embedding Layer**

    This layer is used to lookup embeddings of IDs, provided by :attr:`input`, in
    a lookup table. The result of this lookup is the embedding of each ID in the
    :attr:`input`.

    All the input variables are passed in as local variables to the LayerHelper
    constructor.

    Args:
        name_scope: See base class.
        size(tuple|list): The shape of the look up table parameter. It should
            have two elements which indicate the size of the dictionary of
            embeddings and the size of each embedding vector respectively.
        is_sparse(bool): The flag indicating whether to use sparse update.
        is_distributed(bool): Whether to run lookup table from remote parameter server.
        padding_idx(int|long|None): If :attr:`None`, it makes no effect to lookup.
            Otherwise the given :attr:`padding_idx` indicates padding the output
            with zeros whenever lookup encounters it in :attr:`input`. If
            :math:`padding_idx < 0`, the :attr:`padding_idx` to use in lookup is
            :math:`size[0] + padding_idx`.
        param_attr(ParamAttr): Parameters for this layer
        dtype(np.dtype|core.VarDesc.VarType|str): The data type: float32, float16, int64, etc.

    Returns:
        Variable: The tensor variable storing the embeddings of the \
                  supplied inputs.

    Examples:
        .. code-block:: python

          dict_size = len(dataset.ids)
          input = fluid.layers.data(name='ids', shape=[32, 32], dtype='int64')
          embedding = fluid.imperative.Embedding('embedding', size=[dict_size, 16])
          emb = embedding(input)
    """

    def __init__(self,
                 name_scope,
                 size,
                 is_sparse=False,
                 is_distributed=False,
                 padding_idx=None,
                 param_attr=None,
                 dtype='float32'):

        super(Embedding, self).__init__(name_scope)
        self._size = size
        self._is_sparse = is_sparse
        self._is_distributed = is_distributed
        self._padding_idx = -1 if padding_idx is None else padding_idx if padding_idx >= 0 else (
            size[0] + padding_idx)

        self._param_attr = param_attr
        self._dtype = dtype
        self._remote_prefetch = self._is_sparse and (not self._is_distributed)
        if self._remote_prefetch:
            assert self._is_sparse is True and self._is_distributed is False

        self._w = self.create_parameter(
            attr=self._param_attr,
            shape=self._size,
            dtype=self._dtype,
            is_bias=False)

    def forward(self, input):
        out = self._helper.create_variable_for_type_inference(self._dtype)
        self._helper.append_op(
            type='lookup_table',
            inputs={'Ids': input,
                    'W': self._w},
            outputs={'Out': out},
            attrs={
                'is_sparse': self._is_sparse,
                'is_distributed': self._is_distributed,
                'remote_prefetch': self._remote_prefetch,
                'padding_idx': self._padding_idx
            })

        return out


class LayerNorm(layers.Layer):
    def __init__(self,
                 name_scope,
                 scale=True,
                 shift=True,
                 begin_norm_axis=1,
                 epsilon=1e-05,
                 param_attr=None,
                 bias_attr=None,
                 act=None):
        """
        ${comment}

        The formula is as follows:

        ..  math::

            \\mu & = \\frac{1}{H}\\sum_{i=1}^{H} a_i

            \\sigma & = \\sqrt{\\frac{1}{H}\sum_{i=1}^{H}(a_i - \\mu)^2}

            h & = f(\\frac{g}{\\sigma}(a - \\mu) + b)

        * :math:`a`: the vector representation of the summed inputs to the neurons
        in that layer.

        * :math:`H`: the number of hidden units in a layer

        * :math:`g`: the trainable scale parameter.

        * :math:`b`: the trainable bias parameter.

        Args:
            name_scope(str): See base class.
            scale(bool): Whether to learn the adaptive gain :math:`g` after
                normalization. Default True.
            shift(bool): Whether to learn the adaptive bias :math:`b` after
                normalization. Default True.
            begin_norm_axis(int): The normalization will be performed along
                dimensions from :attr:`begin_norm_axis` to :attr:`rank(input)`.
                Default 1.
            epsilon(float): The small value added to the variance to prevent
                division by zero. Default 1e-05.
            param_attr(ParamAttr|None): The parameter attribute for the learnable
                gain :math:`g`. If :attr:`scale` is False, :attr:`param_attr` is
                omitted. If :attr:`scale` is True and :attr:`param_attr` is None,
                a default :code:`ParamAttr` would be added as scale. The
                :attr:`param_attr` is initialized as 1 if it is added. Default None.
            bias_attr(ParamAttr|None): The parameter attribute for the learnable
                bias :math:`b`. If :attr:`shift` is False, :attr:`bias_attr` is
                omitted. If :attr:`shift` is True and :attr:`bias_attr` is None,
                a default :code:`ParamAttr` would be added as bias. The
                :attr:`bias_attr` is initialized as 0 if it is added. Default None.
            act(str): Activation to be applied to the output of layer normalization.
                      Default None.
        Returns:
            ${y_comment}

        Examples:

            >>> data = fluid.layers.data(name='data', shape=[3, 32, 32],
            >>>                          dtype='float32')
            >>> layer_norm = fluid.imperative.LayerNorm('layer_norm', begin_norm_axis=1)
            >>> x = layer_norm(data)
        """

        super(LayerNorm, self).__init__(name_scope)
        self._scale = scale
        self._shift = shift
        self._begin_norm_axis = begin_norm_axis
        self._epsilon = epsilon
        self._param_attr = param_attr
        self._bias_attr = bias_attr
        self._act = act

    def _build_once(self, input):
        self._dtype = self._helper.input_dtype(input)
        input_shape = input.shape
        param_shape = [
            reduce(lambda x, y: x * y, input_shape[self._begin_norm_axis:])
        ]
        if self._scale:
            self._scale_w = self.create_parameter(
                attr=self._param_attr,
                shape=param_shape,
                dtype=self._dtype,
                default_initializer=Constant(1.0))
        if self._shift:
            assert self._bias_attr is not False
            self._bias_w = self.create_parameter(
                attr=self._bias_attr,
                shape=param_shape,
                dtype=self._dtype,
                is_bias=True)

    def forward(self, input):
        inputs = dict()
        inputs['X'] = input
        if self._scale:
            inputs['Scale'] = self._scale_w
        if self._shift:
            inputs['Bias'] = self._bias_w
        # create output
        mean_out = self._helper.create_variable_for_type_inference(
            dtype=self._dtype, stop_gradient=True)
        variance_out = self._helper.create_variable_for_type_inference(
            dtype=self._dtype, stop_gradient=True)
        layer_norm_out = self._helper.create_variable_for_type_inference(
            self._dtype)

        self._helper.append_op(
            type="layer_norm",
            inputs=inputs,
            outputs={
                "Y": layer_norm_out,
                "Mean": mean_out,
                "Variance": variance_out,
            },
            attrs={
                "epsilon": self._epsilon,
                "begin_norm_axis": self._begin_norm_axis
            })

        return self._helper.append_activation(layer_norm_out, act=self._act)


class GRUUnit(layers.Layer):
    """
    **GRU unit layer**

    if origin_mode is True, then the equation of a gru step is from paper
    `Learning Phrase Representations using RNN Encoder-Decoder for Statistical
    Machine Translation <https://arxiv.org/pdf/1406.1078.pdf>`_

        .. math::
            u_t & = actGate(xu_{t} + W_u h_{t-1} + b_u)

            r_t & = actGate(xr_{t} + W_r h_{t-1} + b_r)

            m_t & = actNode(xm_t + W_c dot(r_t, h_{t-1}) + b_m)

            h_t & = dot(u_t, h_{t-1}) + dot((1-u_t), m_t)

    if origin_mode is False, then the equation of a gru step is from paper
    `Empirical Evaluation of Gated Recurrent Neural Networks on Sequence
    Modeling <https://arxiv.org/pdf/1412.3555.pdf>`_

        .. math::
            u_t & = actGate(xu_{t} + W_u h_{t-1} + b_u)

            r_t & = actGate(xr_{t} + W_r h_{t-1} + b_r)

            m_t & = actNode(xm_t + W_c dot(r_t, h_{t-1}) + b_m)

            h_t & = dot((1-u_t), h_{t-1}) + dot(u_t, m_t)


    The inputs of the gru unit include :math:`z_t` and :math:`h_{t-1}`. In terms
    of the equation above, the :math:`z_t` is split into 3 parts -
    :math:`xu_t`, :math:`xr_t` and :math:`xm_t`. This means that in order to
    implement a full GRU unit operator for an input, a fully
    connected layer has to be applied, such that :math:`z_t = W_{fc}x_t`.

    The terms :math:`u_t` and :math:`r_t` represent the update and reset gates
    of the GRU cell. Unlike LSTM, GRU has one fewer gate. However, there is
    an intermediate candidate hidden output, which is denoted by :math:`m_t`.
    This layer has three outputs :math:`h_t`, :math:`dot(r_t, h_{t-1})`
    and concatenation of :math:`u_t`, :math:`r_t` and :math:`m_t`.

    Args:
        input (Variable): The fc transformed input value of current step.
        name_scope (str): See base class.
        hidden (Variable): The hidden value of gru unit from previous step.
        size (integer): The input dimension value.
        param_attr(ParamAttr|None): The parameter attribute for the learnable
            hidden-hidden weight matrix. Note:

            - The shape of the weight matrix is :math:`(T \\times 3D)`, where
              :math:`D` is the hidden size.
            - All elements in the weight matrix can be divided into two parts.
              The first part are weights of the update gate and reset gate with
              shape :math:`(D \\times 2D)`, and the second part are weights for
              candidate hidden state with shape :math:`(D \\times D)`.

            If it is set to None or one attribute of ParamAttr, gru_unit will
            create ParamAttr as param_attr. If the Initializer of the param_attr
            is not set, the parameter is initialized with Xavier. Default: None.
        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias
            of GRU.Note that the bias with :math:`(1 \\times 3D)` concatenates
            the bias in the update gate, reset gate and candidate calculations.
            If it is set to False, no bias will be applied to the update gate,
            reset gate and candidate calculations. If it is set to None or one
            attribute of ParamAttr, gru_unit will create ParamAttr as
            bias_attr. If the Initializer of the bias_attr is not set, the bias
            is initialized zero. Default: None.
        activation (string): The activation type for cell (actNode).
                             Default: 'tanh'
        gate_activation (string): The activation type for gates (actGate).
                                  Default: 'sigmoid'

    Returns:
        tuple: The hidden value, reset-hidden value and gate values.
    """

    def __init__(self,
                 name_scope,
                 size,
                 param_attr=None,
                 bias_attr=None,
                 activation='tanh',
                 gate_activation='sigmoid',
                 origin_mode=False,
                 dtype='float32'):
        super(GRUUnit, self).__init__(name_scope)

        activation_dict = dict(
            identity=0,
            sigmoid=1,
            tanh=2,
            relu=3, )
        self.activation = activation_dict[activation]
        self.gate_activation = activation_dict[gate_activation]

        self._dtype = dtype
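        # `size` is the width of the fc-transformed gate input, i.e. 3 * D for
        # hidden size D, so D is recovered by dividing by three.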
        size = size // 3
        # create weight
        self._weight = self.create_parameter(
            attr=param_attr, shape=[size, 3 * size], dtype=dtype)

        # create bias
        bias_size = [1, 3 * size]
        self._bias = self.create_parameter(
            attr=bias_attr, shape=bias_size, dtype=dtype, is_bias=True)

    def forward(self, input, hidden):
        inputs = {'Input': input, 'HiddenPrev': hidden, 'Weight': self._weight}
        if self._bias:
            inputs['Bias'] = self._bias

        gate = self._helper.create_variable_for_type_inference(self._dtype)
        reset_hidden_pre = self._helper.create_variable_for_type_inference(
            self._dtype)
        updated_hidden = self._helper.create_variable_for_type_inference(
            self._dtype)
        self._helper.append_op(
            type='gru_unit',
            inputs=inputs,
            outputs={
                'Gate': gate,
                'ResetHiddenPrev': reset_hidden_pre,
                'Hidden': updated_hidden,
            },
            attrs={
                'activation': self.activation,
                'gate_activation': self.gate_activation,
            })

        return updated_hidden, reset_hidden_pre, gate
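
# Usage sketch for GRUUnit (illustrative comment, not executed): assumes the
# same imperative guard/to_variable helpers as the Conv2D sketch above; D is
# the hidden size and the step input must already be fc-transformed to 3 * D.
#
#     D = 32
#     with fluid.imperative.guard():
#         step_in = fluid.imperative.to_variable(
#             np.random.random((4, 3 * D)).astype('float32'))
#         prev_h = fluid.imperative.to_variable(
#             np.random.random((4, D)).astype('float32'))
#         gru = GRUUnit("gru_unit", size=3 * D)
#         hidden, reset_hidden_prev, gate = gru(step_in, prev_h)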