[Cherry-pick]to Release/2.3, Improve MSRAInitializer (#43721)

* fix conflict * improve the doc

[Cherry-pick]to Release/2.3, Improve MSRAInitializer (#43721)
* fix conflict * improve the doc
1aafc31b · Jackwaterveg · GitHub · 4dcfc6df · 1aafc31b · 1aafc31b
隐藏空白更改
内联并排

Showing 2 changed files with 69 additions and 44 deletions

python/paddle/fluid/initializer.py python/paddle/fluid/initializer.py +45 -32

python/paddle/nn/initializer/kaiming.py python/paddle/nn/initializer/kaiming.py +24 -12

未找到文件。
--- a/python/paddle/fluid/initializer.py
+++ b/python/paddle/fluid/initializer.py
@@ -679,20 +679,23 @@ class MSRAInitializer(Initializer):
    .. math::
-        x = \sqrt{\\frac{6.0}{fan\_in}}
+        x = gain \times \sqrt{\frac{3}{fan\_in}}
    In case of Normal distribution, the mean is 0 and the standard deviation
    is
    .. math::
-        \sqrt{\\frac{2.0}{fan\_in}}
+        \frac{gain}{\sqrt{{fan\_in}}}
    Args:
        uniform (bool): whether to use uniform or normal distribution
-        fan_in (float32|None): fan_in for MSRAInitializer. If None, it is\
+        fan_in (float32|None): fan_in (in_features) of trainable Tensor,\
-        inferred from the variable. default is None.
+        If None, it will be inferred automatically. If you don't want to use in_features of the Tensor,\
+        you can set the value of 'fan_in' explicitly yourself. Default is None.
        seed (int32): random seed
+        negative_slope (float, optional): negative_slope (only used with leaky_relu). default is 0.0.
+        nonlinearity(str, optional): the non-linear function. default is relu.
    Note:
        It is recommended to set fan_in to None for most cases.
@@ -709,7 +712,12 @@ class MSRAInitializer(Initializer):
    """
-    def __init__(self, uniform=True, fan_in=None, seed=0):
+    def __init__(self,
+                 uniform=True,
+                 fan_in=None,
+                 seed=0,
+                 negative_slope=0,
+                 nonlinearity='relu'):
        """Constructor for MSRAInitializer
        """
        assert uniform is not None
@@ -718,6 +726,8 @@ class MSRAInitializer(Initializer):
        self._uniform = uniform
        self._fan_in = fan_in
        self._seed = seed
+        self._negative_slope = negative_slope
+        self._nonlinearity = nonlinearity
    def __call__(self, var, block=None):
        """Initialize the input tensor with MSRA initialization.
@@ -759,13 +769,16 @@ class MSRAInitializer(Initializer):
        if framework._non_static_mode():
            if self._uniform:
-                limit = np.sqrt(6.0 / float(fan_in))
+                gain = calculate_gain(self._nonlinearity, self._negative_slope)
+                limit = gain * math.sqrt(3.0 / float(fan_in))
                out_var = _C_ops.uniform_random('shape', out_var.shape, 'min',
                                                -limit, 'max', limit, 'seed',
                                                self._seed, 'dtype',
                                                int(out_dtype))
            else:
-                std = math.sqrt(2.0 / float(fan_in))
+                gain = calculate_gain(self._nonlinearity, self._negative_slope)
+                std = gain / math.sqrt(float(fan_in))
                if in_dygraph_mode():
                    place = _current_expected_place()
                    out_var = _C_ops.final_state_gaussian_random(
@@ -786,33 +799,33 @@ class MSRAInitializer(Initializer):
            return None
        else:
            if self._uniform:
-                limit = np.sqrt(6.0 / float(fan_in))
+                gain = calculate_gain(self._nonlinearity, self._negative_slope)
-                op = block.append_op(
+                limit = gain * math.sqrt(3.0 / float(fan_in))
-                    type="uniform_random",
+                op = block.append_op(type="uniform_random",
-                    inputs={},
+                                     inputs={},
-                    outputs={"Out": out_var},
+                                     outputs={"Out": out_var},
-                    attrs={
+                                     attrs={
-                        "shape": out_var.shape,
+                                         "shape": out_var.shape,
-                        "dtype": int(out_dtype),
+                                         "dtype": int(out_dtype),
-                        "min": -limit,
+                                         "min": -limit,
-                        "max": limit,
+                                         "max": limit,
-                        "seed": self._seed
+                                         "seed": self._seed
-                    },
+                                     },
-                    stop_gradient=True)
+                                     stop_gradient=True)
            else:
-                std = np.sqrt(2.0 / float(fan_in))
+                gain = calculate_gain(self._nonlinearity, self._negative_slope)
-                op = block.append_op(
+                std = gain / math.sqrt(float(fan_in))
-                    type="gaussian_random",
+                op = block.append_op(type="gaussian_random",
-                    outputs={"Out": out_var},
+                                     outputs={"Out": out_var},
-                    attrs={
+                                     attrs={
-                        "shape": out_var.shape,
+                                         "shape": out_var.shape,
-                        "dtype": int(out_dtype),
+                                         "dtype": int(out_dtype),
-                        "mean": 0.0,
+                                         "mean": 0.0,
-                        "std": std,
+                                         "std": std,
-                        "seed": self._seed
+                                         "seed": self._seed
-                    },
+                                     },
-                    stop_gradient=True)
+                                     stop_gradient=True)
            if var.dtype == VarDesc.VarType.FP16 or (
                    var.dtype == VarDesc.VarType.BF16 and not self._uniform):

--- a/python/paddle/nn/initializer/kaiming.py
+++ b/python/paddle/nn/initializer/kaiming.py
@@ -33,11 +33,14 @@ class KaimingNormal(MSRAInitializer):
    .. math::
-        \sqrt{\frac{2.0}{fan\_in}}
+        \frac{gain}{\sqrt{{fan\_in}}}
    Args:
-        fan_in (float32|None): fan_in for Kaiming normal Initializer. If None, it is\
+        fan_in (float32|None): fan_in (in_features) of trainable Tensor,\
-        inferred from the variable. default is None.
+        If None, it will be inferred automatically. If you don't want to use in_features of the Tensor,\
+        you can set the value of 'fan_in' explicitly yourself. Default is None.
+        negative_slope (float, optional): negative_slope (only used with leaky_relu). default is 0.0.
+        nonlinearity(str, optional): the non-linear function. default is relu.
    Note:
        It is recommended to set fan_in to None for most cases.
@@ -56,9 +59,12 @@ class KaimingNormal(MSRAInitializer):
    """
-    def __init__(self, fan_in=None):
+    def __init__(self, fan_in=None, negative_slope=0.0, nonlinearity='relu'):
-        super(KaimingNormal, self).__init__(
+        super(KaimingNormal, self).__init__(uniform=False,
-            uniform=False, fan_in=fan_in, seed=0)
+                                            fan_in=fan_in,
+                                            seed=0,
+                                            negative_slope=negative_slope,
+                                            nonlinearity=nonlinearity)
 class KaimingUniform(MSRAInitializer):
@@ -75,11 +81,14 @@ class KaimingUniform(MSRAInitializer):
    .. math::
-        x = \sqrt{\frac{6.0}{fan\_in}}
+        x = gain \times \sqrt{\frac{3}{fan\_in}}
    Args:
-        fan_in (float32|None): fan_in for Kaiming uniform Initializer. If None, it is\
+        fan_in (float32|None): fan_in (in_features) of trainable Tensor,\
-        inferred from the variable. default is None.
+        If None, it will be inferred automatically. If you don't want to use in_features of the Tensor,\
+        you can set the value of 'fan_in' explicitly yourself. Default is None.
+        negative_slope (float, optional): negative_slope (only used with leaky_relu). default is 0.0.
+        nonlinearity(str, optional): the non-linear function. default is relu.
    Note:
        It is recommended to set fan_in to None for most cases.
@@ -98,6 +107,9 @@ class KaimingUniform(MSRAInitializer):
    """
-    def __init__(self, fan_in=None):
+    def __init__(self, fan_in=None, negative_slope=0.0, nonlinearity='relu'):
-        super(KaimingUniform, self).__init__(
+        super(KaimingUniform, self).__init__(uniform=True,
-            uniform=True, fan_in=fan_in, seed=0)
+                                             fan_in=fan_in,
+                                             seed=0,
+                                             negative_slope=negative_slope,
+                                             nonlinearity=nonlinearity)