Unverified · Commit 4da1c4f1, authored by SunGaofeng, committed by GitHub

fix g_param shape mismatch in WeightNormParamAttr (#18940)

* fix g_param shape mismatch in WeightNormParamAttr

* add comment to show why insert reshape in startup_program
test=develop

Parent af63b118
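For readers landing here without context: weight normalization reparameterizes each weight as w = g * v / ||v||, where g carries the magnitude of each weight slice and v its direction; g_param holds g, and its shape is what this commit fixes. A minimal NumPy sketch (shapes illustrative, not from the PR) of the identity the startup initialization relies on:

```python
import numpy as np

# Weight norm (Salimans & Kingma, 2016): w = g * v / ||v||.
# startup_program initializes g to ||v||, so the reparameterized
# weight initially equals v exactly.
v = np.random.rand(4, 3).astype('float32')          # illustrative shape
norm_v = np.linalg.norm(v, axis=1, keepdims=True)   # per-row norm (dim=0 kept)
g = norm_v.copy()                                   # g_param's initial value
w = g * v / norm_v
np.testing.assert_allclose(w, v, rtol=1e-6)
```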
@@ -167,7 +167,7 @@ paddle.fluid.layers.unsqueeze (ArgSpec(args=['input', 'axes', 'name'], varargs=N
 paddle.fluid.layers.lod_reset (ArgSpec(args=['x', 'y', 'target_lod'], varargs=None, keywords=None, defaults=(None, None)), ('document', '74498d37dd622ac472cb36887fce09ea'))
 paddle.fluid.layers.lod_append (ArgSpec(args=['x', 'level'], varargs=None, keywords=None, defaults=None), ('document', '37663c7c179e920838a250ea0e28d909'))
 paddle.fluid.layers.lrn (ArgSpec(args=['input', 'n', 'k', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(5, 1.0, 0.0001, 0.75, None)), ('document', '73d297256da8954617996958d26ee93d'))
-paddle.fluid.layers.pad (ArgSpec(args=['x', 'paddings', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0.0, None)), ('document', '2f189f8ef61f1c23779e1593b78755c0'))
+paddle.fluid.layers.pad (ArgSpec(args=['x', 'paddings', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0.0, None)), ('document', '36b6e58678956585e5b30aa3de123a60'))
 paddle.fluid.layers.pad_constant_like (ArgSpec(args=['x', 'y', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0.0, None)), ('document', '95aa1972983f30fe9b5a3713e523e20f'))
 paddle.fluid.layers.label_smooth (ArgSpec(args=['label', 'prior_dist', 'epsilon', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, 0.1, 'float32', None)), ('document', '214f1dfbe95a628600bbe99e836319cf'))
 paddle.fluid.layers.roi_pool (ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1, 1, 1.0)), ('document', 'ceedc8c22752c623d6e1ea2e8df0f43f'))
@@ -994,7 +994,7 @@ paddle.fluid.CUDAPinnedPlace ('paddle.fluid.core_avx.CUDAPinnedPlace', ('documen
 paddle.fluid.CUDAPinnedPlace.__init__ __init__(self: paddle.fluid.core_avx.CUDAPinnedPlace) -> None
 paddle.fluid.ParamAttr ('paddle.fluid.param_attr.ParamAttr', ('document', 'fa47fa251f727c4a4f638d61e3c7c141'))
 paddle.fluid.ParamAttr.__init__ (ArgSpec(args=['self', 'name', 'initializer', 'learning_rate', 'regularizer', 'trainable', 'gradient_clip', 'do_model_average'], varargs=None, keywords=None, defaults=(None, None, 1.0, None, True, None, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.WeightNormParamAttr ('paddle.fluid.param_attr.WeightNormParamAttr', ('document', '48ab4f49c7eeeade5958b731b6a96aa0'))
+paddle.fluid.WeightNormParamAttr ('paddle.fluid.param_attr.WeightNormParamAttr', ('document', 'b5ae1698ea72d5a9428000b916a67379'))
 paddle.fluid.WeightNormParamAttr.__init__ (ArgSpec(args=['self', 'dim', 'name', 'initializer', 'learning_rate', 'regularizer', 'trainable', 'gradient_clip', 'do_model_average'], varargs=None, keywords=None, defaults=(None, None, None, 1.0, None, True, None, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.DataFeeder ('paddle.fluid.data_feeder.DataFeeder', ('document', 'a39802654f20692ad49c340cef7c6556'))
 paddle.fluid.DataFeeder.__init__ (ArgSpec(args=['self', 'feed_list', 'place', 'program'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
...
@@ -177,19 +177,24 @@ class LayerHelperBase(object):
             elif dim == 0:
                 out_shape = [x.shape[0]] + [1] * (len(x.shape) - 1)
                 reshape = __reshape_op(x, shape=[x.shape[0], -1], block=block)
-                norm = __norm_op(reshape, dim=1, block=block)
+                norm = __norm_op(reshape, dim=[1], block=block)
                 __reshape_op(norm, out=out, shape=out_shape, block=block)
             elif dim == len(x.shape) - 1:
                 out_shape = [1] * (len(x.shape) - 1) + [x.shape[-1]]
                 reshape = __reshape_op(x, shape=[-1, x.shape[-1]], block=block)
-                norm = __norm_op(reshape, dim=0, block=block)
+                norm = __norm_op(reshape, dim=[0], block=block)
                 __reshape_op(norm, out=out, shape=out_shape, block=block)
             else:
                 perm = list(range(len(x.shape)))
                 perm[0], perm[dim] = dim, 0
                 transpose = __transpose_op(x, perm, block=block)
-                norm = __norm_op(transpose, dim=0, block=block)
-                __transpose_op(norm, perm, out=out, block=block)
+                out_shape = [transpose.shape[0]] + [1] * (len(transpose.shape) -
+                                                          1)
+                reshape = __reshape_op(
+                    transpose, shape=[transpose.shape[0], -1], block=block)
+                norm = __norm_op(reshape, dim=[1], block=block)
+                reshape2 = __reshape_op(norm, shape=out_shape, block=block)
+                __transpose_op(reshape2, perm, out=out, block=block)
             return out

         def __weight_normalize(g, v, dim):
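The old `else` branch normed the transposed tensor over axis 0 only, which for tensors with more than two dimensions leaves the trailing axes un-reduced; the new branch flattens them into one row per slice first. A minimal NumPy sketch of what the new branch computes (an illustration, not Paddle code):

```python
import numpy as np

def norm_except_dim(x, dim):
    # Mirror of the new `else` branch: move `dim` to the front, flatten the
    # remaining axes into one row per slice, take the L2 norm of each row,
    # then restore the layout so the result broadcasts against `x`.
    perm = list(range(x.ndim))
    perm[0], perm[dim] = dim, 0
    t = np.transpose(x, perm)                 # __transpose_op
    rows = t.reshape(t.shape[0], -1)          # __reshape_op
    norm = np.linalg.norm(rows, axis=1)       # __norm_op(..., dim=[1])
    out_shape = [t.shape[0]] + [1] * (x.ndim - 1)
    # perm is a swap, so applying it again transposes back (reshape2 + transpose).
    return np.transpose(norm.reshape(out_shape), perm)

x = np.random.rand(3, 4, 5).astype('float32')
g = norm_except_dim(x, dim=1)
assert g.shape == (1, 4, 1)   # one norm per slice along dim=1, broadcastable to x
```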
@@ -240,6 +245,13 @@ class LayerHelperBase(object):
             dim=attr.dim,
             block=self.startup_program.global_block())
+        # keep g_param shape to be consistent with that in main_program
+        __reshape_op(
+            g_param,
+            g_param_shape,
+            out=g_param,
+            block=self.startup_program.global_block())
+
         # Add weight normalization to main_program
         g_param = self.main_program.global_block().create_parameter(
             dtype=dtype, shape=g_param_shape, **g_param_attr._to_kwargs())
...
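The reshape in startup_program matters because both programs refer to the g parameter by the same name: startup_program initializes it, main_program declares it with shape `g_param_shape`, and the two shapes must match exactly; being broadcast-compatible is not enough. A rough NumPy analogy of the bookkeeping (shapes illustrative, not taken from the PR):

```python
import numpy as np

# Illustrative: a weight of shape (3, 4, 5), weight-normalized over dim=1,
# so g is declared in main_program with shape [1, 4, 1].
shape, dim = (3, 4, 5), 1
g_param_shape = [1] * len(shape)
g_param_shape[dim] = shape[dim]          # -> [1, 4, 1]

v = np.random.rand(*shape).astype('float32')
# What the startup initializer computes: one L2 norm per slice along `dim`.
g = np.linalg.norm(np.moveaxis(v, dim, 0).reshape(shape[dim], -1), axis=1)

# The raw norm is 1-D; the added __reshape_op pins it to the declared shape
# so the variable handed to main_program matches its parameter definition.
g = g.reshape(g_param_shape)
assert list(g.shape) == g_param_shape    # consistent across both programs
```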
@@ -7300,9 +7300,9 @@ def pad(x, paddings, pad_value=0., name=None):
     padded width is specified by :attr:`paddings`.

     Specifically, the number of values padded before the contents of :attr:`x`
-    in dimension :attr:`i` is indicated by :attr:`paddings[i]`, and the number
+    in dimension :attr:`i` is indicated by :attr:`paddings[2i]`, and the number
     of values padded after the contents of :attr:`x` in dimension :attr:`i` is
-    indicated by :attr:`paddings[i+1]`.
+    indicated by :attr:`paddings[2i+1]`.

     See below for an example.
...
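The corrected indexing is easiest to see in an example. A minimal NumPy sketch of the same convention (`numpy.pad` takes the before/after pairs per dimension; `fluid.layers.pad` takes them flattened into one list):

```python
import numpy as np

x = np.ones((2, 3))
# Flattened form used by fluid.layers.pad: paddings[2i] / paddings[2i+1]
# pad before / after dimension i:
#   dim 0: 1 row before, 0 rows after
#   dim 1: 1 column before, 2 columns after
paddings = [1, 0, 1, 2]
pairs = list(zip(paddings[0::2], paddings[1::2]))   # [(1, 0), (1, 2)]
out = np.pad(x, pairs, mode='constant', constant_values=0.0)
assert out.shape == (2 + 1 + 0, 3 + 1 + 2)          # (3, 6)
```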
@@ -180,14 +180,14 @@ class ParamAttr(object):
 class WeightNormParamAttr(ParamAttr):
     """
     Used for weight Norm. Weight Norm is a reparameterization of the weight vectors
-    in a neural network that decouples the length of those weight vectors from
+    in a neural network that decouples the magnitude of those weight vectors from
     their direction. Weight Norm has been implemented as discussed in this
     paper: `Weight Normalization: A Simple Reparameterization to Accelerate
     Training of Deep Neural Networks
     <https://arxiv.org/pdf/1602.07868.pdf>`_.

     Args:
-        dim(list): The parameter's name. Default None.
+        dim(int): Dimension over which to compute the norm. Default None.
         name(str): The parameter's name. Default None.
         initializer(Initializer): The method to initial this parameter. Default None.
         learning_rate(float): The parameter's learning rate. The learning rate when
...
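For context, a minimal usage sketch against the fluid 1.x API, adapted from the class's documented example (layer sizes illustrative): `dim=None` norms over the whole weight, while an integer `dim` keeps that dimension and norms over the rest.

```python
import paddle.fluid as fluid

data = fluid.layers.data(name="data", shape=[3, 32, 32], dtype="float32")
fc = fluid.layers.fc(input=data,
                     size=1000,
                     param_attr=fluid.WeightNormParamAttr(
                         dim=None, name='weight_norm_param'))
```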