#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import copy
import numpy as np

from .framework import (
    Variable,
    default_main_program,
    default_startup_program,
    _non_static_mode,
    _current_expected_place,
    _in_eager_without_dygraph_check,
)
from . import unique_name
from .param_attr import ParamAttr, WeightNormParamAttr
from . import core
from .initializer import _global_weight_initializer, _global_bias_initializer

__all__ = ['LayerHelperBase']


class LayerHelperBase:
    # global dtype
    __dtype = "float32"

    def __init__(self, name, layer_type):
        self._layer_type = layer_type
        self._name = name

    @property
    def name(self):
        return self._name

    @property
    def layer_type(self):
        return self._layer_type

    @property
    def main_program(self):
        return default_main_program()

    @property
    def startup_program(self):
        return default_startup_program()

    @classmethod
    def set_default_dtype(cls, dtype):
        cls.__dtype = dtype

    @classmethod
    def get_default_dtype(cls):
        return cls.__dtype

    def to_variable(self, value, name=None):
        r"""
        This API creates a ``Variable`` object from a numpy\.ndarray or an existing Variable object.

        Parameters:
            value(ndarray): The numpy\.ndarray object to be converted. It can be multi-dimensional, and its data type is one of numpy\.{float16, float32, float64, int16, int32, int64, uint8, uint16}.
            name(str, optional): The default value is None. Normally there is no need for the user to set this property. For more information, please refer to :ref:`api_guide_Name`

        Returns:
            Variable: ``Tensor`` created from the specified numpy\.ndarray object; its data type and shape are the same as ``value`` .

        Examples:

         .. code-block:: python

            import numpy as np
            import paddle.fluid as fluid

            with fluid.dygraph.guard():
                x = np.ones([2, 2], np.float32)
                y = fluid.dygraph.to_variable(x)

        """
        if isinstance(value, np.ndarray):
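            # Wrap the ndarray in an eager Tensor when eager mode is enabled;
            # otherwise fall back to the legacy VarBase container.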
            if _in_eager_without_dygraph_check():
                return core.eager.Tensor(
                    value,
                    _current_expected_place(),
                    False,
                    False,
                    name if name else None,
                    True,
                )
            else:
                py_var = core.VarBase(
                    value=value,
                    name=name if name else '',
                    persistable=False,
                    place=_current_expected_place(),
                    zero_copy=False,
                )
                return py_var
        elif isinstance(value, (core.VarBase, Variable, core.eager.Tensor)):
            return value
        else:
            raise TypeError(
                "The type of input value is invalid, expected type is 'ndarray' or 'Variable', but received %s"
                % type(value)
            )

    def _create_weight_normalize(self, attr, shape, dtype):
        from .layers import elementwise_mul, elementwise_div, reshape

        # Remove these ops when LayerHelper and layers support indicating
        # program and block.
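        #
        # Weight normalization re-parameterizes a weight w as w = g * v / ||v||,
        # where the norm is taken over every dimension except attr.dim
        # (see https://arxiv.org/pdf/1602.07868.pdf). The nested helpers below
        # build the norm/reshape/transpose ops needed for that computation.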
        def __norm_op(
            x,
            out=None,
            p=2,
            dim=None,
            keep_dim=False,
            block=self.startup_program.global_block(),
        ):
            if out is None:
                out = block.create_var(
                    name=unique_name.generate_with_ignorable_key(
                        ".".join([self.name, 'weight_norm_norm'])
                    ),
                    dtype=dtype,
                    persistable=False,
                )
            abs_out = block.create_var(
                name=unique_name.generate_with_ignorable_key(
                    ".".join([self.name, 'weight_norm_abs'])
                ),
                dtype=dtype,
                persistable=False,
            )
            block.append_op(
                type='abs', inputs={'X': x}, outputs={'Out': abs_out}
            )
            pow_out = block.create_var(
                name=unique_name.generate_with_ignorable_key(
                    ".".join([self.name, 'weight_norm_pow'])
                ),
                dtype=dtype,
                persistable=False,
            )
            block.append_op(
                type='pow',
                inputs={'X': abs_out},
                outputs={'Out': pow_out},
                attrs={'factor': float(p)},
            )
            sum_out = block.create_var(
                name=unique_name.generate_with_ignorable_key(
                    ".".join([self.name, 'weight_norm_sum'])
                ),
                dtype=dtype,
                persistable=False,
            )
            block.append_op(
                type='reduce_sum',
                inputs={'X': pow_out},
                outputs={'Out': sum_out},
                attrs={
                    'dim': dim,
                    'keep_dim': keep_dim,
                    'reduce_all': True if dim is None else False,
                },
            )
            block.append_op(
                type='pow',
                inputs={'X': sum_out},
                outputs={'Out': out},
                attrs={'factor': 1.0 / p},
            )
            return out

        def __reshape_op(
            x, shape, out=None, block=self.startup_program.global_block()
        ):
            if out is None:
                out = block.create_var(
                    name=unique_name.generate_with_ignorable_key(
                        ".".join([self.name, 'weight_norm_reshape'])
                    ),
                    dtype=dtype,
                    persistable=False,
                )
            x_shape = block.create_var(name="Xshape", dtype=x.dtype)
            block.append_op(
                type="reshape2",
                inputs={'X': x},
                attrs={'shape': shape},
                outputs={"Out": out, "XShape": x_shape},
            )
            return out

        def __transpose_op(
            x, axis, out=None, block=self.startup_program.global_block()
        ):
            if out is None:
                out = block.create_var(
                    name=unique_name.generate_with_ignorable_key(
                        ".".join([self.name, 'weight_norm_transpose'])
                    ),
                    dtype=dtype,
                    persistable=False,
                )
            block.append_op(
                type='transpose',
                inputs={'X': x},
                outputs={'Out': out},
                attrs={'axis': axis},
            )
            return out

        def __norm_except_dim(
            x, out=None, dim=None, block=self.startup_program.global_block()
        ):
            """Computes the norm over all dimensions except dim"""
            if out is None:
                out = block.create_var(
                    name=unique_name.generate_with_ignorable_key(
                        ".".join([self.name, 'weight_norm_norm'])
                    ),
                    dtype=dtype,
                    persistable=False,
                )
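            # dim is None: reduce over every dimension at once.
            # dim == 0 or the last dim: flatten x to 2-D, take the norm along
            # the flattened axis, then reshape back to a broadcastable shape.
            # Any other dim: transpose that dim to the front, apply the dim == 0
            # path, then transpose the result back.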
            if dim is None:
                __norm_op(x, out, dim=dim, block=block)
            elif dim == 0:
                out_shape = [x.shape[0]] + [1] * (len(x.shape) - 1)
                reshape = __reshape_op(x, shape=[x.shape[0], -1], block=block)
                norm = __norm_op(reshape, dim=[1], block=block)
                __reshape_op(norm, out=out, shape=out_shape, block=block)
            elif dim == len(x.shape) - 1:
                out_shape = [1] * (len(x.shape) - 1) + [x.shape[-1]]
                reshape = __reshape_op(x, shape=[-1, x.shape[-1]], block=block)
                norm = __norm_op(reshape, dim=[0], block=block)
                __reshape_op(norm, out=out, shape=out_shape, block=block)
            else:
                perm = list(range(len(x.shape)))
                perm[0], perm[dim] = dim, 0
                transpose = __transpose_op(x, perm, block=block)
                out_shape = [transpose.shape[0]] + [1] * (
                    len(transpose.shape) - 1
                )
                reshape = __reshape_op(
                    transpose, shape=[transpose.shape[0], -1], block=block
                )
                norm = __norm_op(reshape, dim=[1], block=block)
                reshape2 = __reshape_op(norm, shape=out_shape, block=block)
                __transpose_op(reshape2, perm, out=out, block=block)
            return out

        def __weight_normalize(g, v, dim):
            """Calculations for weight normalization"""
            norm = __norm_except_dim(
                v, dim=dim, block=self.main_program.current_block()
            )
            scale = elementwise_div(
                x=g, y=norm
            )  # The shapes of g and norm are the same.
            # Currently, elementwise_mul only support broadcast when the shape
            # of y is a subset of the shape of x. Thus, we reshape y to squeeze
            # to achieve the subset.
            w = elementwise_mul(
                x=v,
                y=scale
                if dim is None
                else reshape(x=scale, shape=[v.shape[dim]]),
                axis=-1 if dim is None else dim,
            )
            # To serialize the original parameter for inference, maybe a
            # parameter rather than a variable should be returned.
            return w

        g_param_attr = copy.deepcopy(attr)
        g_param_attr.name = attr.name + '_g'
        g_param_shape = [1] * len(shape)
        if attr.dim is not None:
            g_param_shape[attr.dim] = shape[attr.dim]
        v_param_attr = copy.deepcopy(attr)
        v_param_attr.name = attr.name + '_v'
        v_param_shape = shape

        # Add to startup_program to initialize g and v.
        # Try to reconstruct the initializer of w by initializing g and v.
        # Set the initializers of g and v as below, then the distribution
        # of w is the same as initializing w with the given initializer.
        # For Data-Dependent Initialization, please compute the init-values
        # of g and v in external and then feed the values to g and v by
        # executing an extra program.
        g_param = self.startup_program.global_block().create_parameter(
            dtype=dtype,
            shape=g_param_shape,
            **g_param_attr._to_kwargs(with_initializer=False)
        )
        v_param = self.startup_program.global_block().create_parameter(
            dtype=dtype,
            shape=v_param_shape,
            **v_param_attr._to_kwargs(with_initializer=True)
        )
        __norm_except_dim(
            x=v_param,
            out=g_param,
            dim=attr.dim,
            block=self.startup_program.global_block(),
        )

        # keep g_param shape to be consistent with that in main_program
        __reshape_op(
            g_param,
            g_param_shape,
            out=g_param,
            block=self.startup_program.global_block(),
        )

        # Add weight normalization to main_program
        g_param = self.main_program.global_block().create_parameter(
            dtype=dtype, shape=g_param_shape, **g_param_attr._to_kwargs()
        )
        v_param = self.main_program.global_block().create_parameter(
            dtype=dtype, shape=v_param_shape, **v_param_attr._to_kwargs()
        )
        w_param = __weight_normalize(g_param, v_param, dim=attr.dim)
        return w_param

    # TODO: hide the func after we move the layers to Layers
    def create_parameter(
        self,
        attr,
        shape,
        dtype=None,
        is_bias=False,
        default_initializer=None,
        stop_gradient=False,
        type=core.VarDesc.VarType.LOD_TENSOR,
    ):
        """Create parameters for this layers.

           Args:
               attr: [ParamAttr] should be the parameter attribute for this parameter
T
tianshuo78520a 已提交
351
               shape: shape of the parameter
352 353 354 355 356 357 358 359
               dtype: data type of this parameter
               is_bias: if this is a bias parameter
               default_initializer: set the default initializer for this parameter

        Returns the created parameter ``Variable``.
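
        Example (an illustrative sketch only; assumes the call site is a
        ``LayerHelperBase`` subclass used inside a layer implementation):

        .. code-block:: python

            import paddle.fluid as fluid

            w = self.create_parameter(
                attr=fluid.ParamAttr(name=None),
                shape=[784, 10],
                dtype='float32',
                is_bias=False)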
        """
        # Deepcopy the attr so that parameters can be shared in program
        attr = copy.deepcopy(attr)
        attr = ParamAttr._to_attr(attr)
        if not attr:
            return None
        assert isinstance(attr, ParamAttr)
        for i, size in enumerate(shape):
            assert size > 0, (
                "Expected every dim's size to be larger than 0, "
                "but the size of the {}-th dim is {}".format(i, size)
            )
        # set global dtype
        if not dtype:
            dtype = self.__dtype
        if is_bias:
            suffix = 'b'
            default_initializer = (
                _global_bias_initializer()
                if _global_bias_initializer() is not None
                else default_initializer
            )
        else:
            suffix = 'w'
            default_initializer = (
                _global_weight_initializer()
                if _global_weight_initializer() is not None
                else default_initializer
            )
        if attr.name is None:
            attr.name = unique_name.generate(".".join([self.name, suffix]))

        if default_initializer is None and attr.initializer is None:
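            # Default initializers are only defined for floating point dtypes
            # (bf16 may arrive as the uint16 storage dtype); any other dtype
            # must be given an explicit default_initializer.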
            if isinstance(dtype, core.VarDesc.VarType):
                if (
                    dtype != core.VarDesc.VarType.FP32
                    and dtype != core.VarDesc.VarType.FP64
                    and dtype != core.VarDesc.VarType.FP16
                    and dtype != core.VarDesc.VarType.BF16
                ):
                    raise TypeError(
                        "Can not create parameter with default initializer when dtype is not float type. Set default_initializer to fit the parameter dtype!"
                    )
            else:
                if not (
                    dtype.startswith("float") or dtype in ["double", "uint16"]
                ):
                    raise TypeError(
                        "Can not create parameter with default initializer when dtype is not float type. Set default_initializer to fit the parameter dtype!"
                    )
            if is_bias:
                attr._set_default_bias_initializer()
            else:
                attr._set_default_param_initializer()
        else:
            attr._set_default_initializer(default_initializer)

        # If weight normalization is set, insert extra parameters and ops.
        # Refer to https://arxiv.org/pdf/1602.07868.pdf
        if isinstance(attr, WeightNormParamAttr):
            param = self._create_weight_normalize(attr, shape, dtype)
            WeightNormParamAttr.params_with_weight_norm.append(param)
            return param
        if _non_static_mode():
            # In dygraph mode, we want the returned parameter to be
            # initialized so that it can be used imperatively.
            # check parameter name
            is_used = unique_name.dygraph_parameter_name_checker(attr.name)
            if is_used:
                raise ValueError(
                    "parameter name [{}] have be been used. "
                    "In dygraph mode, the name of parameter can't be same."
                    "Please check the parameter attr value passed to self.create_parameter or "
430 431
                    "constructor of dygraph Layers".format(attr.name)
                )
            return self.main_program.global_block().create_parameter(
                dtype=dtype,
                shape=shape,
                type=type,
                stop_gradient=stop_gradient,
                **attr._to_kwargs(with_initializer=True)
            )
        else:
            self.startup_program.global_block().create_parameter(
                dtype=dtype,
                shape=shape,
                type=type,
                **attr._to_kwargs(with_initializer=True)
            )
            return self.main_program.global_block().create_parameter(
                dtype=dtype, shape=shape, type=type, **attr._to_kwargs()
            )

    def create_variable_for_type_inference(
        self, dtype, stop_gradient=False, shape=None
    ):
        """Create a temporary variable that should be type inferred layer.

        Note:
            The default type will be set to LOD_TENSOR. However, when
            the var is used as operator output, its type will be updated
            based on operator's `VarTypeInference` implementation in
            infer_var_type.
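
        Example (illustrative only):

        .. code-block:: python

            out = self.create_variable_for_type_inference(dtype='float32')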
        """
        # set global dtype
        if not dtype:
            dtype = self.__dtype
        return self.main_program.current_block().create_var(
            name=unique_name.generate_with_ignorable_key(
                ".".join([self.name, 'tmp'])
            ),
            dtype=dtype,
            shape=shape,
            type=core.VarDesc.VarType.LOD_TENSOR,
            persistable=False,
            stop_gradient=stop_gradient,
        )

    def create_sparse_variable_for_type_inference(
        self, dtype, stop_gradient=False, shape=None
    ):
        """Create a temporary sparse variable that should be type inferred layer.

        Note:
            The default type will be set to SPARSE_COO. However, when
            the var is used as operator output, its type will be updated
            based on operator's `VarTypeInference` implementation in
            infer_var_type.
        """
        # set global dtype
        if not dtype:
            dtype = self.__dtype
        return self.main_program.current_block().create_var(
            name=unique_name.generate_with_ignorable_key(
                ".".join([self.name, 'tmp'])
            ),
            dtype=dtype,
            shape=shape,
            type=core.VarDesc.VarType.SPARSE_COO,
            persistable=False,
            stop_gradient=stop_gradient,
        )

    def create_variable(self, *args, **kwargs):
        """Create Variable for this layers.
        Returns created Variable.
        """
        return self.main_program.current_block().create_var(*args, **kwargs)

    def create_global_variable(self, persistable=False, *args, **kwargs):
        """
        Create a global variable. Note that there is no initializer for this global variable.
        Args:
            persistable(bool): True if it is a checkpoint value.
            *args: See create_var's documentation
            **kwargs: See create_var's documentation

        Returns(Variable): the created variable.
        """
        return self.main_program.global_block().create_var(
            *args, persistable=persistable, **kwargs
        )

    def create_or_get_global_variable(self, name, *args, **kwargs):
        """
        Creates a global variable if it does not exist, and returns the variable
        together with a boolean flag which is True when the variable is newly created.
        """
        if self.main_program.global_block().has_var(name):
            return self.main_program.global_block().var(name), False
        else:
            return self.create_global_variable(name=name, *args, **kwargs), True

    def set_variable_initializer(self, var, initializer):
        """Set target Variable's initializer

        Args:
            var: target Variable
            initializer: initializer to use
        """
        assert isinstance(var, Variable)
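        # In dygraph mode the initializer is applied to `var` immediately;
        # in static graph mode a persistable copy of `var` is declared in the
        # startup program so that the initialization op runs there.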
        if _non_static_mode():
            initializer(var, self.main_program.global_block())
        else:
            self.startup_program.global_block().create_var(
                name=var.name,
                type=var.type,
                dtype=var.dtype,
                shape=var.shape,
                persistable=True,
                initializer=initializer,
            )