#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import copy
import numpy as np
import paddle

from .framework import (
    Variable,
    default_main_program,
    default_startup_program,
    _non_static_mode,
    _current_expected_place,
    _in_eager_without_dygraph_check,
)
from . import unique_name
from .param_attr import ParamAttr, WeightNormParamAttr
from . import core
from .initializer import _global_weight_initializer, _global_bias_initializer

__all__ = ['LayerHelperBase']


class LayerHelperBase:
    # global dtype
    __dtype = "float32"

    def __init__(self, name, layer_type):
        self._layer_type = layer_type
        self._name = name

    @property
    def name(self):
        return self._name

    @property
    def layer_type(self):
        return self._layer_type

    @property
    def main_program(self):
        return default_main_program()

    @property
    def startup_program(self):
        return default_startup_program()

    @classmethod
    def set_default_dtype(cls, dtype):
        cls.__dtype = dtype

    @classmethod
    def get_default_dtype(cls):
        return cls.__dtype

    def to_variable(self, value, name=None):
        r"""
        This API creates a ``Variable`` object from a numpy\.ndarray or an existing Variable object.

        Parameters:
            value(ndarray): The numpy\.ndarray object to be converted. It can be multi-dimensional, and its data type is one of numpy\.{float16, float32, float64, int16, int32, int64, uint8, uint16}.
            name(str, optional): The default value is None. Normally there is no need for the user to set this property. For more information, please refer to :ref:`api_guide_Name`

        Returns:
            Variable: ``Tensor`` created from the specified numpy\.ndarray object, whose data type and shape are the same as ``value`` .

        Examples:

         .. code-block:: python

            import numpy as np
            import paddle.fluid as fluid

            with fluid.dygraph.guard():
                x = np.ones([2, 2], np.float32)
                y = fluid.dygraph.to_variable(x)

        """
        if isinstance(value, np.ndarray):
            if _in_eager_without_dygraph_check():
                return core.eager.Tensor(
                    value,
                    _current_expected_place(),
                    False,
                    False,
                    name if name else None,
                    True,
                )
            else:
                py_var = core.VarBase(
                    value=value,
                    name=name if name else '',
                    persistable=False,
                    place=_current_expected_place(),
                    zero_copy=False,
                )
                return py_var
        elif isinstance(value, (core.VarBase, Variable, core.eager.Tensor)):
            return value
        else:
            raise TypeError(
                "The type of input value is invalid, expected type is 'ndarray' or 'Variable', but received %s"
                % type(value)
            )

    def _create_weight_normalize(self, attr, shape, dtype):
        from .layers import elementwise_mul

        # Remove these ops when LayerHelper and layers support indicating
        # program and block.
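        # Weight normalization (https://arxiv.org/abs/1602.07868)
        # re-parameterizes a weight tensor w as w = g * v / ||v||, where the
        # norm is taken over every dimension except `attr.dim`. The nested
        # helpers below build the norm, reshape and transpose ops needed to
        # express this decomposition in the startup and main programs.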
        def __norm_op(
            x,
            out=None,
            p=2,
            dim=None,
            keep_dim=False,
            block=self.startup_program.global_block(),
        ):
            if out is None:
                out = block.create_var(
                    name=unique_name.generate_with_ignorable_key(
                        ".".join([self.name, 'weight_norm_norm'])
                    ),
                    dtype=dtype,
                    persistable=False,
                )
            abs_out = block.create_var(
                name=unique_name.generate_with_ignorable_key(
                    ".".join([self.name, 'weight_norm_abs'])
                ),
                dtype=dtype,
                persistable=False,
            )
            block.append_op(
                type='abs', inputs={'X': x}, outputs={'Out': abs_out}
            )
            pow_out = block.create_var(
                name=unique_name.generate_with_ignorable_key(
                    ".".join([self.name, 'weight_norm_pow'])
                ),
                dtype=dtype,
                persistable=False,
            )
            block.append_op(
                type='pow',
                inputs={'X': abs_out},
                outputs={'Out': pow_out},
                attrs={'factor': float(p)},
            )
            sum_out = block.create_var(
                name=unique_name.generate_with_ignorable_key(
                    ".".join([self.name, 'weight_norm_sum'])
                ),
                dtype=dtype,
                persistable=False,
            )
            block.append_op(
                type='reduce_sum',
                inputs={'X': pow_out},
                outputs={'Out': sum_out},
                attrs={
                    'dim': dim,
                    'keep_dim': keep_dim,
                    'reduce_all': True if dim is None else False,
                },
            )
            block.append_op(
                type='pow',
                inputs={'X': sum_out},
                outputs={'Out': out},
                attrs={'factor': 1.0 / p},
            )
            return out

        def __reshape_op(
            x, shape, out=None, block=self.startup_program.global_block()
        ):
            if out is None:
                out = block.create_var(
                    name=unique_name.generate_with_ignorable_key(
                        ".".join([self.name, 'weight_norm_reshape'])
                    ),
                    dtype=dtype,
                    persistable=False,
                )
            x_shape = block.create_var(name="Xshape", dtype=x.dtype)
            block.append_op(
                type="reshape2",
                inputs={'X': x},
                attrs={'shape': shape},
                outputs={"Out": out, "XShape": x_shape},
            )
            return out

        def __transpose_op(
            x, axis, out=None, block=self.startup_program.global_block()
        ):
            if out is None:
                out = block.create_var(
                    name=unique_name.generate_with_ignorable_key(
                        ".".join([self.name, 'weight_norm_transpose'])
                    ),
                    dtype=dtype,
                    persistable=False,
                )
            block.append_op(
                type='transpose',
                inputs={'X': x},
                outputs={'Out': out},
                attrs={'axis': axis},
            )
            return out

        def __norm_except_dim(
            x, out=None, dim=None, block=self.startup_program.global_block()
        ):
            """Computes the norm over all dimensions except dim"""
            if out is None:
                out = block.create_var(
                    name=unique_name.generate_with_ignorable_key(
                        ".".join([self.name, 'weight_norm_norm'])
                    ),
                    dtype=dtype,
                    persistable=False,
                )
            if dim is None:
                __norm_op(x, out, dim=dim, block=block)
            elif dim == 0:
                out_shape = [x.shape[0]] + [1] * (len(x.shape) - 1)
                reshape = __reshape_op(x, shape=[x.shape[0], -1], block=block)
                norm = __norm_op(reshape, dim=[1], block=block)
                __reshape_op(norm, out=out, shape=out_shape, block=block)
            elif dim == len(x.shape) - 1:
                out_shape = [1] * (len(x.shape) - 1) + [x.shape[-1]]
                reshape = __reshape_op(x, shape=[-1, x.shape[-1]], block=block)
                norm = __norm_op(reshape, dim=[0], block=block)
                __reshape_op(norm, out=out, shape=out_shape, block=block)
            else:
                perm = list(range(len(x.shape)))
                perm[0], perm[dim] = dim, 0
                transpose = __transpose_op(x, perm, block=block)
                out_shape = [transpose.shape[0]] + [1] * (
                    len(transpose.shape) - 1
                )
                reshape = __reshape_op(
                    transpose, shape=[transpose.shape[0], -1], block=block
                )
                norm = __norm_op(reshape, dim=[1], block=block)
                reshape2 = __reshape_op(norm, shape=out_shape, block=block)
                __transpose_op(reshape2, perm, out=out, block=block)
            return out

        def __weight_normalize(g, v, dim):
            """Calculations for weight normalization"""
            norm = __norm_except_dim(
                v, dim=dim, block=self.main_program.current_block()
            )
            scale = paddle.divide(
                x=g, y=norm
            )  # The shapes of g and norm are the same.
            # Currently, elementwise_mul only supports broadcasting when the
            # shape of y is a subset of the shape of x. Thus, we reshape the
            # scale (squeezing its size-1 dims) so that its shape satisfies
            # this requirement.
            w = elementwise_mul(
                x=v,
                y=scale
                if dim is None
                else paddle.reshape(x=scale, shape=[v.shape[dim]]),
                axis=-1 if dim is None else dim,
            )
            # To serialize the original parameter for inference, maybe a
            # parameter rather than a variable should be returned.
            return w

        g_param_attr = copy.deepcopy(attr)
        g_param_attr.name = attr.name + '_g'
        g_param_shape = [1] * len(shape)
        if attr.dim is not None:
            g_param_shape[attr.dim] = shape[attr.dim]
        v_param_attr = copy.deepcopy(attr)
        v_param_attr.name = attr.name + '_v'
        v_param_shape = shape

        # Add to startup_program to initialize g and v.
        # Try to reconstruct the initializer of w by initializing g and v.
        # Set the initializers of g and v as below, then the distribution
        # of w is the same as initializing w with the given initializer.
        # For Data-Dependent Initialization, please compute the init-values
        # of g and v externally and then feed the values to g and v by
        # executing an extra program.
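        # Concretely, v is created with the requested initializer and g is
        # then set to ||v|| in the startup program, so the initial
        # w = g * v / ||v|| equals v exactly.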
        g_param = self.startup_program.global_block().create_parameter(
            dtype=dtype,
            shape=g_param_shape,
            **g_param_attr._to_kwargs(with_initializer=False)
        )
        v_param = self.startup_program.global_block().create_parameter(
            dtype=dtype,
            shape=v_param_shape,
            **v_param_attr._to_kwargs(with_initializer=True)
        )
        __norm_except_dim(
            x=v_param,
            out=g_param,
            dim=attr.dim,
            block=self.startup_program.global_block(),
        )

        # keep g_param shape to be consistent with that in main_program
        __reshape_op(
            g_param,
            g_param_shape,
            out=g_param,
            block=self.startup_program.global_block(),
        )

        # Add weight normalization to main_program
        g_param = self.main_program.global_block().create_parameter(
            dtype=dtype, shape=g_param_shape, **g_param_attr._to_kwargs()
        )
        v_param = self.main_program.global_block().create_parameter(
            dtype=dtype, shape=v_param_shape, **v_param_attr._to_kwargs()
        )
        w_param = __weight_normalize(g_param, v_param, dim=attr.dim)
        return w_param

    # TODO: hide the func after we move the layers to Layers
    def create_parameter(
        self,
        attr,
        shape,
        dtype=None,
        is_bias=False,
        default_initializer=None,
        stop_gradient=False,
        type=core.VarDesc.VarType.LOD_TENSOR,
    ):
        """Create parameters for this layers.

           Args:
               attr: [ParamAttr] should be the parameter attribute for this parameter
               shape: shape of the parameter
               dtype: data type of this parameter
               is_bias: if this is a bias parameter
               default_initializer: set the default initializer for this parameter

        Returns the created parameter Variable.
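
        Examples:
            .. code-block:: python

                # Illustrative sketch only: LayerHelperBase is an internal
                # helper and the names used here are placeholders.
                import paddle
                import paddle.fluid as fluid
                from paddle.fluid.layer_helper_base import LayerHelperBase

                paddle.enable_static()
                helper = LayerHelperBase(name="demo", layer_type="fc")
                w = helper.create_parameter(
                    attr=fluid.ParamAttr(), shape=[4, 8], dtype='float32'
                )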
        """
        # Deepcopy the attr so that parameters can be shared in program
        attr = copy.deepcopy(attr)
        attr = ParamAttr._to_attr(attr)
        if not attr:
            return None
        assert isinstance(attr, ParamAttr)
        for i, size in enumerate(shape):
            assert size > 0, (
                "Expected every dim's size to be larger than 0, "
                "but the size of the {}-th dim is {}".format(i, size)
            )
        # set global dtype
        if not dtype:
            dtype = self.__dtype
        if is_bias:
            suffix = 'b'
            default_initializer = (
                _global_bias_initializer()
                if _global_bias_initializer() is not None
                else default_initializer
            )
        else:
            suffix = 'w'
            default_initializer = (
                _global_weight_initializer()
                if _global_weight_initializer() is not None
                else default_initializer
            )
        if attr.name is None:
            attr.name = unique_name.generate(".".join([self.name, suffix]))

        if default_initializer is None and attr.initializer is None:
            if isinstance(dtype, core.VarDesc.VarType):
                if (
                    dtype != core.VarDesc.VarType.FP32
                    and dtype != core.VarDesc.VarType.FP64
                    and dtype != core.VarDesc.VarType.FP16
                    and dtype != core.VarDesc.VarType.BF16
                ):
                    raise TypeError(
                        "Can not create parameter with default initializer when dtype is not float type. Set default_initializer to fit the parameter dtype!"
                    )
            else:
                if not (
                    dtype.startswith("float") or dtype in ["double", "uint16"]
                ):
                    raise TypeError(
                        "Can not create parameter with default initializer when dtype is not float type. Set default_initializer to fit the parameter dtype!"
                    )
            if is_bias:
                attr._set_default_bias_initializer()
            else:
                attr._set_default_param_initializer()
        else:
            attr._set_default_initializer(default_initializer)

        # If weight normalization is set, insert extra parameters and ops.
        # Refer to https://arxiv.org/pdf/1602.07868.pdf
        if isinstance(attr, WeightNormParamAttr):
            param = self._create_weight_normalize(attr, shape, dtype)
            WeightNormParamAttr.params_with_weight_norm.append(param)
            return param
        if _non_static_mode():
            # In dygraph mode, we want the returned parameter to be
            # initialized so that it can be used imperatively.
            # check parameter name
            is_used = unique_name.dygraph_parameter_name_checker(attr.name)
            if is_used:
                raise ValueError(
                    "parameter name [{}] has already been used. "
                    "In dygraph mode, parameter names must be unique. "
                    "Please check the parameter attr value passed to self.create_parameter or "
                    "the constructor of dygraph Layers".format(attr.name)
                )
            return self.main_program.global_block().create_parameter(
                dtype=dtype,
                shape=shape,
                type=type,
                stop_gradient=stop_gradient,
                **attr._to_kwargs(with_initializer=True)
            )
        else:
            self.startup_program.global_block().create_parameter(
                dtype=dtype,
                shape=shape,
                type=type,
                **attr._to_kwargs(with_initializer=True)
            )
            return self.main_program.global_block().create_parameter(
                dtype=dtype, shape=shape, type=type, **attr._to_kwargs()
            )

    def create_variable_for_type_inference(
        self, dtype, stop_gradient=False, shape=None
    ):
        """Create a temporary variable that should be type inferred layer.

        Note:
            The default type will be set to LOD_TENSOR. However, when
            the var is used as operator output, its type will be updated
            based on operator's `VarTypeInference` implementation in
            infer_var_type.
        """
        # set global dtype
        if not dtype:
            dtype = self.__dtype
        return self.main_program.current_block().create_var(
            name=unique_name.generate_with_ignorable_key(
                ".".join([self.name, 'tmp'])
            ),
            dtype=dtype,
            shape=shape,
            type=core.VarDesc.VarType.LOD_TENSOR,
            persistable=False,
            stop_gradient=stop_gradient,
        )

    def create_sparse_variable_for_type_inference(
        self, dtype, stop_gradient=False, shape=None
    ):
        """Create a temporary sparse variable that should be type inferred layer.

        Note:
            The default type will be set to SPARSE_COO. However, when
            the var is used as operator output, its type will be updated
            based on operator's `VarTypeInference` implementation in
            infer_var_type.
        """
        # set global dtype
        if not dtype:
            dtype = self.__dtype
        return self.main_program.current_block().create_var(
            name=unique_name.generate_with_ignorable_key(
                ".".join([self.name, 'tmp'])
            ),
            dtype=dtype,
            shape=shape,
            type=core.VarDesc.VarType.SPARSE_COO,
            persistable=False,
            stop_gradient=stop_gradient,
        )

    def create_variable(self, *args, **kwargs):
        """Create Variable for this layers.
        Returns created Variable.
        """
        return self.main_program.current_block().create_var(*args, **kwargs)

    def create_global_variable(self, persistable=False, *args, **kwargs):
        """
        Create a global variable. Note that there is no initializer for this global variable.
        Args:
            persistable(bool): True if it is a checkpoint value.
            *args: See create_var's documentation
            **kwargs: See create_var's documentation

        Returns(Variable): the created variable.
        """
        return self.main_program.global_block().create_var(
            *args, persistable=persistable, **kwargs
        )

    def create_or_get_global_variable(self, name, *args, **kwargs):
        """
        Creates a global variable if it does not already exist, and returns
        the variable together with a boolean flag that is True when the
        variable is newly created.
        """
        if self.main_program.global_block().has_var(name):
            return self.main_program.global_block().var(name), False
        else:
            return self.create_global_variable(name=name, *args, **kwargs), True

    def set_variable_initializer(self, var, initializer):
        """Set target Variable's initializer

        Args:
            var: target Variable
            initializer: initializer to use
        """
        assert isinstance(var, Variable)
        if _non_static_mode():
            initializer(var, self.main_program.global_block())
        else:
            self.startup_program.global_block().create_var(
                name=var.name,
                type=var.type,
                dtype=var.dtype,
                shape=var.shape,
                persistable=True,
548 549
                initializer=initializer,
            )