diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py
index 5177d752a15fe98d0988a1749ba0d73f2b8bf4a1..d867ac3b0eb239443ec9e0847234310b8a5bd094 100644
--- a/python/paddle/fluid/initializer.py
+++ b/python/paddle/fluid/initializer.py
@@ -679,20 +679,23 @@ class MSRAInitializer(Initializer):
 
     .. math::
 
-        x = \sqrt{\\frac{6.0}{fan\_in}}
+        x = gain \times \sqrt{\frac{3}{fan\_in}}
 
     In case of Normal distribution, the mean is 0 and the standard deviation
     is
 
     .. math::
 
-        \sqrt{\\frac{2.0}{fan\_in}}
+        \frac{gain}{\sqrt{fan\_in}}
 
     Args:
         uniform (bool): whether to use uniform or normal distribution
-        fan_in (float32|None): fan_in for MSRAInitializer. If None, it is\
-        inferred from the variable. default is None.
+        fan_in (float32|None): fan_in (in_features) of the trainable Tensor.\
+        If None, it will be inferred automatically. If you do not want to\
+        use the in_features of the Tensor, set fan_in explicitly. default is None.
         seed (int32): random seed
+        negative_slope (float, optional): negative slope of the rectifier (only used with 'leaky_relu'). default is 0.0.
+        nonlinearity (str, optional): name of the non-linear function. default is 'relu'.
 
     Note:
         It is recommended to set fan_in to None for most cases.
@@ -709,7 +712,12 @@ class MSRAInitializer(Initializer):
 
     """
 
-    def __init__(self, uniform=True, fan_in=None, seed=0):
+    def __init__(self,
+                 uniform=True,
+                 fan_in=None,
+                 seed=0,
+                 negative_slope=0.0,
+                 nonlinearity='relu'):
         """Constructor for MSRAInitializer
         """
         assert uniform is not None
@@ -718,6 +726,8 @@ class MSRAInitializer(Initializer):
         self._uniform = uniform
         self._fan_in = fan_in
         self._seed = seed
+        self._negative_slope = negative_slope
+        self._nonlinearity = nonlinearity
 
     def __call__(self, var, block=None):
         """Initialize the input tensor with MSRA initialization.
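For reference, the patched math can be checked with a minimal pure-Python sketch. The `_gain` helper below is illustrative only; it mirrors the standard Kaiming gain formulas (which `calculate_gain` implements for the cases this initializer uses) rather than calling the Paddle function:

    import math

    def _gain(nonlinearity, negative_slope=0.0):
        # Standard Kaiming gains; illustrative stand-in for calculate_gain().
        if nonlinearity == 'relu':
            return math.sqrt(2.0)
        if nonlinearity == 'leaky_relu':
            return math.sqrt(2.0 / (1.0 + negative_slope ** 2))
        if nonlinearity == 'tanh':
            return 5.0 / 3.0
        return 1.0  # 'linear', 'sigmoid', ... default to 1 in this sketch

    def kaiming_bounds(fan_in, nonlinearity='relu', negative_slope=0.0):
        gain = _gain(nonlinearity, negative_slope)
        limit = gain * math.sqrt(3.0 / fan_in)  # uniform branch: U(-limit, limit)
        std = gain / math.sqrt(fan_in)          # normal branch: N(0, std^2)
        return limit, std

    print(kaiming_bounds(256))                     # (0.1530..., 0.0883...)
    print(kaiming_bounds(256, 'leaky_relu', 0.1))  # slightly tighter bounds

Since Var(U(-limit, limit)) = limit^2 / 3 = gain^2 / fan_in = std^2, both branches produce the same variance, and with the defaults (nonlinearity='relu', negative_slope=0) the gain is sqrt(2), so the new code reproduces the old sqrt(6.0 / fan_in) and sqrt(2.0 / fan_in) values exactly.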
@@ -759,13 +769,16 @@ class MSRAInitializer(Initializer):
 
         if framework._non_static_mode():
             if self._uniform:
-                limit = np.sqrt(6.0 / float(fan_in))
+                gain = calculate_gain(self._nonlinearity, self._negative_slope)
+                limit = gain * math.sqrt(3.0 / float(fan_in))
+
                 out_var = _C_ops.uniform_random('shape', out_var.shape, 'min',
                                                 -limit, 'max', limit, 'seed',
                                                 self._seed, 'dtype',
                                                 int(out_dtype))
             else:
-                std = math.sqrt(2.0 / float(fan_in))
+                gain = calculate_gain(self._nonlinearity, self._negative_slope)
+                std = gain / math.sqrt(float(fan_in))
                 if in_dygraph_mode():
                     place = _current_expected_place()
                     out_var = _C_ops.final_state_gaussian_random(
@@ -786,33 +799,33 @@ class MSRAInitializer(Initializer):
             return None
         else:
             if self._uniform:
-                limit = np.sqrt(6.0 / float(fan_in))
-                op = block.append_op(
-                    type="uniform_random",
-                    inputs={},
-                    outputs={"Out": out_var},
-                    attrs={
-                        "shape": out_var.shape,
-                        "dtype": int(out_dtype),
-                        "min": -limit,
-                        "max": limit,
-                        "seed": self._seed
-                    },
-                    stop_gradient=True)
+                gain = calculate_gain(self._nonlinearity, self._negative_slope)
+                limit = gain * math.sqrt(3.0 / float(fan_in))
+                op = block.append_op(type="uniform_random",
+                                     inputs={},
+                                     outputs={"Out": out_var},
+                                     attrs={
+                                         "shape": out_var.shape,
+                                         "dtype": int(out_dtype),
+                                         "min": -limit,
+                                         "max": limit,
+                                         "seed": self._seed
+                                     },
+                                     stop_gradient=True)
             else:
-                std = np.sqrt(2.0 / float(fan_in))
-                op = block.append_op(
-                    type="gaussian_random",
-                    outputs={"Out": out_var},
-                    attrs={
-                        "shape": out_var.shape,
-                        "dtype": int(out_dtype),
-                        "mean": 0.0,
-                        "std": std,
-                        "seed": self._seed
-                    },
-                    stop_gradient=True)
+                gain = calculate_gain(self._nonlinearity, self._negative_slope)
+                std = gain / math.sqrt(float(fan_in))
+                op = block.append_op(type="gaussian_random",
+                                     outputs={"Out": out_var},
+                                     attrs={
+                                         "shape": out_var.shape,
+                                         "dtype": int(out_dtype),
+                                         "mean": 0.0,
+                                         "std": std,
+                                         "seed": self._seed
+                                     },
+                                     stop_gradient=True)
 
         if var.dtype == VarDesc.VarType.FP16 or (
                 var.dtype == VarDesc.VarType.BF16 and not self._uniform):
diff --git a/python/paddle/nn/initializer/kaiming.py b/python/paddle/nn/initializer/kaiming.py
index 88a52268776fcb06e0f0fae6c1fb0886efe18340..ef34bc1785fde14788a85d45cad19ad0d04b2c26 100644
--- a/python/paddle/nn/initializer/kaiming.py
+++ b/python/paddle/nn/initializer/kaiming.py
@@ -33,11 +33,14 @@ class KaimingNormal(MSRAInitializer):
 
     .. math::
 
-        \sqrt{\frac{2.0}{fan\_in}}
+        \frac{gain}{\sqrt{fan\_in}}
 
     Args:
-        fan_in (float32|None): fan_in for Kaiming normal Initializer. If None, it is\
-        inferred from the variable. default is None.
+        fan_in (float32|None): fan_in (in_features) of the trainable Tensor.\
+        If None, it will be inferred automatically. If you do not want to\
+        use the in_features of the Tensor, set fan_in explicitly. default is None.
+        negative_slope (float, optional): negative slope of the rectifier (only used with 'leaky_relu'). default is 0.0.
+        nonlinearity (str, optional): name of the non-linear function. default is 'relu'.
 
     Note:
         It is recommended to set fan_in to None for most cases.
@@ -56,9 +59,12 @@ class KaimingNormal(MSRAInitializer):
 
     """
 
-    def __init__(self, fan_in=None):
-        super(KaimingNormal, self).__init__(
-            uniform=False, fan_in=fan_in, seed=0)
+    def __init__(self, fan_in=None, negative_slope=0.0, nonlinearity='relu'):
+        super(KaimingNormal, self).__init__(uniform=False,
+                                            fan_in=fan_in,
+                                            seed=0,
+                                            negative_slope=negative_slope,
+                                            nonlinearity=nonlinearity)
 
 
 class KaimingUniform(MSRAInitializer):
@@ -75,11 +81,14 @@ class KaimingUniform(MSRAInitializer):
 
     .. math::
 
-        x = \sqrt{\frac{6.0}{fan\_in}}
+        x = gain \times \sqrt{\frac{3}{fan\_in}}
 
     Args:
-        fan_in (float32|None): fan_in for Kaiming uniform Initializer. If None, it is\
-        inferred from the variable. default is None.
+        fan_in (float32|None): fan_in (in_features) of the trainable Tensor.\
+        If None, it will be inferred automatically. If you do not want to\
+        use the in_features of the Tensor, set fan_in explicitly. default is None.
+        negative_slope (float, optional): negative slope of the rectifier (only used with 'leaky_relu'). default is 0.0.
+        nonlinearity (str, optional): name of the non-linear function. default is 'relu'.
 
     Note:
         It is recommended to set fan_in to None for most cases.
@@ -98,6 +107,9 @@ class KaimingUniform(MSRAInitializer):
 
     """
 
-    def __init__(self, fan_in=None):
-        super(KaimingUniform, self).__init__(
-            uniform=True, fan_in=fan_in, seed=0)
+    def __init__(self, fan_in=None, negative_slope=0.0, nonlinearity='relu'):
+        super(KaimingUniform, self).__init__(uniform=True,
+                                             fan_in=fan_in,
+                                             seed=0,
+                                             negative_slope=negative_slope,
+                                             nonlinearity=nonlinearity)
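Assuming the patched classes above, a short usage sketch (standard ParamAttr wiring; the layer and sizes are arbitrary):

    import paddle
    import paddle.nn as nn

    # Match the initialization to a LeakyReLU(0.1) activation.
    init = nn.initializer.KaimingNormal(negative_slope=0.1,
                                        nonlinearity='leaky_relu')
    linear = nn.Linear(256, 128,
                       weight_attr=paddle.ParamAttr(initializer=init))

    # fan_in defaults to None and is inferred from the weight shape
    # (256 here); pass fan_in=... to override it. KaimingUniform
    # accepts the same arguments.

Keeping 'relu' as the default nonlinearity keeps the change backward compatible: code that constructs these initializers without arguments gets the same distributions as before.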